class Nokogiri::HTML::Document

Public Class Methods

parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) { |options| ... } click to toggle source

Parse HTML. string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

# File lib/nokogiri/html/document.rb, line 166
# Parse HTML from +string_or_io+ and return the resulting document.
#
# Input that responds to +read+ is parsed as a stream through +read_io+ (a
# Pathname is expanded and opened first); anything else is parsed as an
# in-memory buffer through +read_memory+.  +url+ defaults to the input's
# +path+ when it has one.  +encoding+ defaults to the input's own encoding
# name unless that name is "ASCII-8BIT".  An Integer +options+ is wrapped in
# Nokogiri::XML::ParseOptions; +options+ is yielded to the block, if one is
# given, before any parsing happens.
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  url ||= string_or_io.path if string_or_io.respond_to?(:path)

  # A binary (ASCII-8BIT) input carries no usable encoding hint.
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
    encoding ||= string_or_io.encoding.name
  end

  # In-memory input: everything that cannot be read as a stream.
  unless string_or_io.respond_to?(:read)
    # read_memory cannot handle an empty document, so build a blank one.
    if string_or_io.nil? || string_or_io.empty?
      return new unless encoding
      return new.tap { |doc| doc.encoding = encoding }
    end

    encoding ||= EncodingReader.detect_encoding(string_or_io)
    return read_memory(string_or_io, url, encoding, options.to_i)
  end

  if string_or_io.is_a?(Pathname)
    # Resolve the Pathname to the file and open it as an IO object, see #2110.
    string_or_io = string_or_io.expand_path.open
    url ||= string_or_io.path
  end

  unless encoding
    # Libxml2 detects encodings poorly: it ignores the HTML5-style
    # <meta charset> declaration, and even when it does pick up a hint it
    # never re-decodes the part of the document it has already consumed.
    #
    # EncodingReader does the detection itself and emulates rewinding the
    # stream: the first read_io attempt is aborted with EncodingFound as
    # soon as a hint turns up, and parsing is restarted below with the
    # encoding that was found.
    string_or_io = EncodingReader.new(string_or_io)
    begin
      return read_io(string_or_io, url, encoding, options.to_i)
    rescue EncodingFound => e
      encoding = e.found_encoding
    end
  end

  read_io(string_or_io, url, encoding, options.to_i)
end

Public Instance Methods

fragment(tags = nil) click to toggle source

Create a Nokogiri::XML::DocumentFragment from tags

# File lib/nokogiri/html/document.rb, line 153
# Build a DocumentFragment for this document from +tags+, passing the
# document's root as the third constructor argument.
def fragment(tags = nil)
  context_node = root
  DocumentFragment.new(self, tags, context_node)
end
meta_encoding() click to toggle source

Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.

# File lib/nokogiri/html/document.rb, line 11
# Return the encoding declared by the document's meta tags, or nil.
#
# A <meta charset="..."> tag is consulted first; failing that, the charset
# parameter is extracted from the content attribute of the tag returned by
# meta_content_type.
def meta_encoding
  if (charset_tag = at('//meta[@charset]'))
    charset_tag[:charset]
  elsif (content_type_tag = meta_content_type)
    content_type_tag['content'][/charset\s*=\s*([\w-]+)/i, 1]
  end
end
meta_encoding=(encoding) click to toggle source

Set the meta tag encoding for this document.

If a meta encoding tag is already present, its content is replaced with the given text.

Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, and before any text node or content element (typically <body>) if any.

The result when trying to set an encoding that is different from the document encoding is undefined.

Beware that in CRuby, libxml2 automatically inserts a meta tag into the head element.

# File lib/nokogiri/html/document.rb, line 36
# Set the meta tag encoding for this document and return +encoding+.
#
# Every existing declaration is rewritten: the http-equiv Content-Type tag
# returned by meta_content_type gets a new content attribute, and the first
# <meta charset> tag gets a new charset attribute.  Updating both keeps the
# value read back by meta_encoding (which looks at <meta charset> first) in
# step with what was just written.
#
# When neither tag exists a new <meta> is created -- a charset attribute if
# the internal subset reports an HTML5 DTD, an http-equiv/content pair
# otherwise -- and placed at the start of <head>, or handed to
# set_metadata_element when the document has no <head>.
def meta_encoding= encoding
  content_type_meta = meta_content_type
  charset_meta = at('//meta[@charset]')

  if content_type_meta || charset_meta
    content_type_meta['content'] = 'text/html; charset=%s' % encoding if content_type_meta
    charset_meta['charset'] = encoding if charset_meta
    return encoding
  end

  meta = XML::Node.new('meta', self)
  dtd = internal_subset
  if dtd && dtd.html5_dtd?
    meta['charset'] = encoding
  else
    meta['http-equiv'] = 'Content-Type'
    meta['content'] = 'text/html; charset=%s' % encoding
  end

  if (head = at('//head'))
    head.prepend_child(meta)
  else
    set_metadata_element(meta)
  end
  encoding
end
serialize(options = {}) click to toggle source

Serialize Node using options. Save options can also be set using a block. See SaveOptions.

These two statements are equivalent:

node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)

or

node.serialize(:encoding => 'UTF-8') do |config|
  config.format.as_xml
end
Calls superclass method
# File lib/nokogiri/html/document.rb, line 146
# Serialize the document, defaulting :save_with to
# XML::Node::SaveOptions::DEFAULT_HTML when the caller did not supply one,
# then delegate to the superclass implementation.
#
# The default is applied to a copy of +options+, so the caller's hash is
# never modified and a frozen hash is accepted.
def serialize(options = {})
  options = options.dup
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  # Bare super forwards the current value of +options+ and any block.
  super
end
title() click to toggle source

Get the title string of this document. Return nil if there is no title tag.

# File lib/nokogiri/html/document.rb, line 72
# Return the inner text of the first <title> element, or nil when the
# document has none.
def title
  title_node = at('//title')
  title_node && title_node.inner_text
end
title=(text) click to toggle source

Set the title string of this document.

If a title element is already present, its content is replaced with the given text.

Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, right after a meta encoding/charset tag if any, and before any text node or content element (typically <body>) if any.

# File lib/nokogiri/html/document.rb, line 87
# Set the title string of this document and return +text+.
#
# An existing <title> has its children replaced with a text node holding
# +text+.  Otherwise a new <title> is created and placed, in order of
# preference: right after a charset declaration (a <meta charset> tag or the
# tag returned by meta_content_type), at the end of <head>, or wherever
# set_metadata_element puts it.
def title=(text)
  tnode = XML::Text.new(text, self)
  if (existing = at('//title'))
    existing.children = tnode
    return text
  end

  title_node = XML::Node.new('title', self) << tnode
  # The charset declaration is checked before <head> so the title lands
  # after it even when the declaration is inside an existing <head>.
  if (meta = at('//meta[@charset]') || meta_content_type)
    meta.add_next_sibling(title_node)
  elsif (head = at('//head'))
    head << title_node
  else
    set_metadata_element(title_node)
  end
  text
end

Private Instance Methods

meta_content_type() click to toggle source
# File lib/nokogiri/html/document.rb, line 62
# Return the first <meta> element that has both an http-equiv and a content
# attribute and whose http-equiv value is "Content-Type" (compared
# case-insensitively), or nil when there is none.
def meta_content_type
  candidates = xpath('//meta[@http-equiv and boolean(@content)]')
  candidates.find do |meta|
    /\AContent-Type\z/i =~ meta['http-equiv']
  end
end
set_metadata_element(element) click to toggle source
# File lib/nokogiri/html/document.rb, line 107
# Insert +element+ at a suitable place for document metadata.
#
# Tried in order: append to an existing <head>; create a <head> at the start
# of an existing <html> and put +element+ in it; insert +element+ before the
# first top-level Element or Text child; otherwise create <html><head> and
# put +element+ in the new head.
def set_metadata_element(element)
  if (head = at('//head'))
    head << element
  elsif (html = at('//html'))
    new_head = html.prepend_child(XML::Node.new('head', self))
    new_head.prepend_child(element)
  elsif (anchor = children.find { |node| XML::Element === node || XML::Text === node })
    # We reach here only if the underlying document model allows
    # <html>/<head> elements to be omitted and does not supply them
    # automatically.
    anchor.add_previous_sibling(element)
  else
    html = add_child(XML::Node.new('html', self))
    new_head = html.add_child(XML::Node.new('head', self))
    new_head.prepend_child(element)
  end
end