class Nokogiri::XML::Document

Nokogiri::XML::Document wraps an xml document.

Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri::XML::Document.parse() for more information on parsing.

For searching a Document, see Nokogiri::XML::Searchable#css and Nokogiri::XML::Searchable#xpath.

Constants

NCNAME_CHAR
NCNAME_RE
NCNAME_START_CHAR

See www.w3.org/TR/REC-xml-names/#ns-decl for more details. Note that we're not attempting to handle unicode characters partly because libxml2 doesn't handle unicode characters in NCNAMEs.

Attributes

errors[RW]

A list of Nokogiri::XML::SyntaxError found when parsing a document

Public Class Methods

new(version = default) click to toggle source

Create a new document with version (defaults to “1.0”)

/*
 * Allocate a new libxml2 document and wrap it as an instance of klass.
 *
 * The first Ruby argument, when present and non-nil, is used as the XML
 * version string; otherwise "1.0" is used. All original arguments are
 * forwarded untouched to noko_xml_document_wrap_with_init_args.
 */
static VALUE
new (int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_args;
  VALUE rb_version;
  xmlDocPtr c_document;

  /* "0*": no required arguments, everything is collected into rb_args. */
  rb_scan_args(argc, argv, "0*", &rb_args);

  rb_version = rb_ary_entry(rb_args, (long)0);
  if (NIL_P(rb_version)) {
    rb_version = rb_str_new2("1.0");
  }

  c_document = xmlNewDoc((xmlChar *)StringValueCStr(rb_version));

  return noko_xml_document_wrap_with_init_args(klass, c_document, argc, argv);
}
parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) { |options| ... } click to toggle source

Parse an XML file.

string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO.

url (optional) is the URI where this document is located.

encoding (optional) is the encoding that should be used when processing the document.

options (optional) is a configuration object that sets options during parsing, such as Nokogiri::XML::ParseOptions::RECOVER. See the Nokogiri::XML::ParseOptions for more information.

block (optional) is passed a configuration object on which parse options may be set.

By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs or access the network. See Nokogiri::XML::ParseOptions for a complete list of options; and that module's DEFAULT_XML constant for what's set (and not set) by default.

Nokogiri.XML() is a convenience method which will call this method.

# File lib/nokogiri/xml/document.rb, line 50
# Parse XML from +string_or_io+ and return the resulting document.
#
# string_or_io:: a String, an object responding to +read+, or a Pathname
# url::          optional location of the document; when not given it falls
#                back to +string_or_io.path+ if that method exists
# encoding::     optional encoding handed to the reader
# options::      an Integer bitmask or a ParseOptions object; an Integer is
#                wrapped in ParseOptions before the optional block is yielded
#
# Empty input raises Nokogiri::XML::SyntaxError when the options are strict,
# otherwise a new empty document is returned (with +encoding+ applied if given).
def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML)
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  url ||= (string_or_io.path if string_or_io.respond_to?(:path))

  if empty_doc?(string_or_io)
    raise Nokogiri::XML::SyntaxError.new("Empty document") if options.strict?

    blank = new
    blank.encoding = encoding if encoding
    return blank
  end

  if string_or_io.respond_to?(:read)
    if string_or_io.is_a?(Pathname)
      # resolve the Pathname to the file and open it as an IO object, see #2110
      string_or_io = string_or_io.expand_path.open
      url ||= string_or_io.path
    end

    doc = read_io(string_or_io, url, encoding, options.to_i)
  else
    # read_memory cannot cope with empty docs; those were handled above
    doc = read_memory(string_or_io, url, encoding, options.to_i)
  end

  # do xinclude processing
  doc.do_xinclude(options) if options.xinclude?

  doc
end
read_io(io, url, encoding, options) click to toggle source

Create a new document from an IO object

/*
 * Build a document by reading from a Ruby IO-like object.
 *
 * url and encoding may be nil, in which case NULL is handed to xmlReadIO.
 * options is converted with NUM2INT and passed as the parser option mask.
 * Errors reported through the structured error callback are collected in a
 * Ruby Array that is stored on the returned document as @errors.
 *
 * When xmlReadIO yields no document, the last libxml2 error is raised as a
 * wrapped syntax error, or RuntimeError if libxml2 recorded no error.
 */
static VALUE
read_io(VALUE klass,
        VALUE io,
        VALUE url,
        VALUE encoding,
        VALUE options)
{
  const char *c_url = NULL;
  const char *c_encoding = NULL;
  VALUE rb_errors;
  VALUE rb_document;
  xmlDocPtr c_document;
  xmlErrorPtr c_error;

  if (!NIL_P(url)) {
    c_url = StringValueCStr(url);
  }
  if (!NIL_P(encoding)) {
    c_encoding = StringValueCStr(encoding);
  }
  rb_errors = rb_ary_new();

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher);

  c_document = xmlReadIO(
                 (xmlInputReadCallback)noko_io_read,
                 (xmlInputCloseCallback)noko_io_close,
                 (void *)io,
                 c_url,
                 c_encoding,
                 (int)NUM2INT(options)
               );

  /* Detach the collector so later libxml2 errors are not pushed onto it. */
  xmlSetStructuredErrorFunc(NULL, NULL);

  if (c_document != NULL) {
    rb_document = noko_xml_document_wrap(klass, c_document);
    rb_iv_set(rb_document, "@errors", rb_errors);
    return rb_document;
  }

  /* c_document is NULL on this path; the call is kept from the original. */
  xmlFreeDoc(c_document);

  c_error = xmlGetLastError();
  if (c_error) {
    rb_exc_raise(Nokogiri_wrap_xml_syntax_error(c_error));
  }
  rb_raise(rb_eRuntimeError, "Could not parse document");

  return Qnil;
}
read_memory(string, url, encoding, options) click to toggle source

Create a new document from a String

/*
 * Build a document from the bytes of a Ruby String.
 *
 * url and encoding may be nil, in which case NULL is handed to xmlReadMemory.
 * options is converted with NUM2INT and passed as the parser option mask.
 * Errors reported through the structured error callback are collected in a
 * Ruby Array that is stored on the returned document as @errors.
 *
 * When xmlReadMemory yields no document, the last libxml2 error is raised as
 * a wrapped syntax error, or RuntimeError if libxml2 recorded no error.
 */
static VALUE
read_memory(VALUE klass,
            VALUE string,
            VALUE url,
            VALUE encoding,
            VALUE options)
{
  const char *c_buffer;
  const char *c_url = NULL;
  const char *c_encoding = NULL;
  int c_length;
  VALUE rb_errors;
  VALUE rb_document;
  xmlDocPtr c_document;
  xmlErrorPtr c_error;

  c_buffer = StringValuePtr(string);
  if (!NIL_P(url)) {
    c_url = StringValueCStr(url);
  }
  if (!NIL_P(encoding)) {
    c_encoding = StringValueCStr(encoding);
  }
  /* The byte length is narrowed to int, which is what xmlReadMemory accepts. */
  c_length = (int)RSTRING_LEN(string);
  rb_errors = rb_ary_new();

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher);
  c_document = xmlReadMemory(c_buffer, c_length, c_url, c_encoding, (int)NUM2INT(options));
  /* Detach the collector so later libxml2 errors are not pushed onto it. */
  xmlSetStructuredErrorFunc(NULL, NULL);

  if (c_document != NULL) {
    rb_document = noko_xml_document_wrap(klass, c_document);
    rb_iv_set(rb_document, "@errors", rb_errors);
    return rb_document;
  }

  /* c_document is NULL on this path; the call is kept from the original. */
  xmlFreeDoc(c_document);

  c_error = xmlGetLastError();
  if (c_error) {
    rb_exc_raise(Nokogiri_wrap_xml_syntax_error(c_error));
  }
  rb_raise(rb_eRuntimeError, "Could not parse document");

  return Qnil;
}

Private Class Methods

empty_doc?(string_or_io) click to toggle source
# File lib/nokogiri/xml/document.rb, line 323
# Report whether +string_or_io+ holds nothing to parse.
#
# Input counts as empty when it is nil, when it responds to +empty?+ and is
# empty, or when it responds to +eof?+ and is already at end of file.
def self.empty_doc?(string_or_io)
  return true if string_or_io.nil?
  return true if string_or_io.respond_to?(:empty?) && string_or_io.empty?

  string_or_io.respond_to?(:eof?) && string_or_io.eof?
end

Public Instance Methods

<<(node_or_tags)
Alias for: add_child
add_child(node_or_tags) click to toggle source
Calls superclass method
# File lib/nokogiri/xml/document.rb, line 309
# Add +node_or_tags+ as a child of the document via the superclass method.
#
# Raises RuntimeError ("A document may not have multiple root nodes.") when
# a root other than the 'nokogiri_text_wrapper' placeholder already exists
# and the argument is neither a comment nor a processing instruction, or
# when the coerced argument is a NodeSet with more than one node.
#
# Note that the root check runs before coercion, so whenever such a root
# exists the raw argument must respond to +comment?+ and
# +processing_instruction?+.
def add_child(node_or_tags)
  has_real_root = root && root.name != 'nokogiri_text_wrapper'
  if has_real_root
    allowed_beside_root = node_or_tags.comment? || node_or_tags.processing_instruction?
    raise "A document may not have multiple root nodes." unless allowed_beside_root
  end

  node_or_tags = coerce(node_or_tags)
  # Bare super forwards the (now coerced) value of node_or_tags.
  return super unless node_or_tags.is_a?(XML::NodeSet)

  raise "A document may not have multiple root nodes." if node_or_tags.size > 1

  super(node_or_tags.first)
end
Also aliased as: <<
canonicalize(mode=XML_C14N_1_0,inclusive_namespaces=nil,with_comments=false) click to toggle source
canonicalize { |obj, parent| ... }

Canonicalize a document and return the results. Takes an optional block that takes two parameters: the obj and that node's parent. The obj will be either a Nokogiri::XML::Node or a Nokogiri::XML::Namespace. The block must return a non-nil, non-false value if the obj passed in should be included in the canonicalized document.

/*
 * Canonicalize this document and return the output as a Ruby String.
 *
 * Optional Ruby arguments, in order:
 *   mode          - Integer passed to xmlC14NExecute as the mode (0 when nil)
 *   incl_ns       - Array of strings passed as the inclusive namespace
 *                   prefix list, or nil for none
 *   with_comments - truthy to pass 1 as the with_comments flag, else 0
 *
 * When a block is given, block_caller is installed as the visibility
 * callback with the block's Proc as its user data.
 *
 * Raises TypeError (via Check_Type) if incl_ns is neither nil nor an Array.
 */
static VALUE
rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
{
  VALUE mode;
  VALUE incl_ns;
  VALUE with_comments;
  xmlChar **ns;
  long ns_len, i;

  xmlDocPtr doc;
  xmlOutputBufferPtr buf;
  xmlC14NIsVisibleCallback cb = NULL;
  void *ctx = NULL;

  VALUE rb_cStringIO;
  VALUE io;

  rb_scan_args(argc, argv, "03", &mode, &incl_ns, &with_comments);

  Data_Get_Struct(self, xmlDoc, doc);

  /* Output is streamed into a fresh StringIO through the noko_io callbacks. */
  rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO"));
  io           = rb_class_new_instance(0, 0, rb_cStringIO);
  buf          = xmlAllocOutputBuffer(NULL);

  buf->writecallback = (xmlOutputWriteCallback)noko_io_write;
  buf->closecallback = (xmlOutputCloseCallback)noko_io_close;
  buf->context       = (void *)io;

  if (rb_block_given_p()) {
    cb = block_caller;
    ctx = (void *)rb_block_proc();
  }

  if (NIL_P(incl_ns)) {
    ns = NULL;
  } else {
    Check_Type(incl_ns, T_ARRAY);
    ns_len = RARRAY_LEN(incl_ns);
    /* One extra zeroed slot keeps the pointer list NULL-terminated. */
    ns = calloc((size_t)ns_len + 1, sizeof(xmlChar *));
    for (i = 0 ; i < ns_len ; i++) {
      VALUE entry = rb_ary_entry(incl_ns, i);
      ns[i] = (xmlChar *)StringValueCStr(entry);
    }
  }

  xmlC14NExecute(doc, cb, ctx,
                 (int)(NIL_P(mode)        ? 0 : NUM2INT(mode)),
                 ns,
                 (int)      RTEST(with_comments),
                 buf);

  /*
   * Release the pointer array allocated above; previously it was leaked on
   * every call that passed incl_ns. Only the array is owned here: the
   * strings it points at belong to the Ruby String objects. free(NULL) is
   * a no-op, so the nil case needs no special handling.
   */
  free(ns);

  xmlOutputBufferClose(buf);

  return rb_funcall(io, rb_intern("string"), 0);
}
clone()
Alias for: dup
collect_namespaces() click to toggle source

Recursively get all namespaces from this node and its subtree and return them as a hash.

For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:hello="world" />
</root>

This method will return:

{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }

WARNING: this method will clobber duplicate names in the keys. For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:foo="baz" />
</root>

The hash returned will look like this: { 'xmlns:foo' => 'bar' }

Non-prefixed default namespaces (as in “xmlns=”) are not included in the hash.

Note that this method does an xpath lookup for nodes with namespaces, and as a result the order may be dependent on the implementation of the underlying XML library.

# File lib/nokogiri/xml/document.rb, line 234
# Gather the namespace nodes found by the "//namespace::*" XPath query into
# a Hash of declaration name => href.
#
# Keys are built as "xmlns:<prefix>", or plain "xmlns" when the prefix is
# nil. Namespaces whose prefix is "xml" are skipped. When the same key
# occurs more than once, the entry seen last overwrites earlier ones.
def collect_namespaces
  xpath("//namespace::*").each_with_object({}) do |ns, collected|
    next if ns.prefix == "xml"

    key = ["xmlns", ns.prefix].compact.join(":")
    collected[key] = ns.href
  end
end
create_cdata(string, &block) click to toggle source

Create a CDATA Node containing string

# File lib/nokogiri/xml/document.rb, line 185
# Build a Nokogiri::XML::CDATA node owned by this document.
#
# +string+ is converted with +to_s+ before being handed to the constructor;
# an optional block is forwarded to the constructor unchanged.
def create_cdata(string, &block)
  Nokogiri::XML::CDATA.new(self, string.to_s, &block)
end
create_comment(string, &block) click to toggle source

Create a Comment Node containing string

# File lib/nokogiri/xml/document.rb, line 190
# Build a Nokogiri::XML::Comment node owned by this document.
#
# +string+ is converted with +to_s+ before being handed to the constructor;
# an optional block is forwarded to the constructor unchanged.
def create_comment(string, &block)
  Nokogiri::XML::Comment.new(self, string.to_s, &block)
end
create_element(name, *contents_or_attrs, &block) click to toggle source

Create a new Element with name sharing GC lifecycle with the document, optionally setting contents or attributes.

Arguments may be passed to initialize the element:

  • a Hash argument will be used to set attributes

  • a non-Hash object that responds to #to_s will be used to set the new node's contents

A block may be passed to mutate the node.

@param name [String] @param contents_or_attrs [#to_s,Hash] @yieldparam node [Nokogiri::XML::Element] @return [Nokogiri::XML::Element]

@example An empty element without attributes

doc.create_element("div")
# => <div></div>

@example An element with contents

doc.create_element("div", "contents")
# => <div>contents</div>

@example An element with attributes

doc.create_element("div", {"class" => "container"})
# => <div class='container'></div>

@example An element with contents and attributes

doc.create_element("div", "contents", {"class" => "container"})
# => <div class='container'>contents</div>

@example Passing a block to mutate the element

doc.create_element("div") { |node| node["class"] = "blue" if before_noon? }
# File lib/nokogiri/xml/document.rb, line 155
# Build a new Element called +name+ that belongs to this document.
#
# Each extra argument is applied in order:
# * a Hash sets attributes, except that keys matching NCNAME_RE are treated
#   as namespace declarations (the first capture group is the prefix);
# * anything else becomes the element's content.
#
# If the element ends up with a namespace definition whose prefix is nil or
# empty, that definition is assigned as the element's own namespace.
# The optional block is forwarded to Element.new. Returns the element.
def create_element(name, *contents_or_attrs, &block)
  element = Nokogiri::XML::Element.new(name, self, &block)

  contents_or_attrs.each do |arg|
    case arg
    when Hash
      arg.each do |attr_name, attr_value|
        key = attr_name.to_s
        declaration = NCNAME_RE.match(key)
        if declaration
          element.add_namespace_definition(declaration[1], attr_value)
        else
          element[key] = attr_value.to_s
        end
      end
    else
      element.content = arg
    end
  end

  default_ns = element.namespace_definitions.find { |n| n.prefix.nil? || (n.prefix == '') }
  element.namespace = default_ns if default_ns

  element
end
create_entity(name, type, external_id, system_id, content) click to toggle source

Create a new entity named name.

type is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl for more information.

external_id, system_id, and content set the External ID, System ID, and content respectively. All of these parameters are optional.

/*
 * Add an entity declaration to this document and return it wrapped as a
 * cNokogiriXmlEntityDecl node.
 *
 * Ruby arguments: name (required), then optional type, external_id,
 * system_id and content. A nil type becomes XML_INTERNAL_GENERAL_ENTITY;
 * any other nil argument is passed to xmlAddDocEntity as NULL.
 *
 * When xmlAddDocEntity returns NULL, the last libxml2 error is raised as a
 * wrapped syntax error, or RuntimeError if libxml2 recorded no error.
 */
static VALUE
create_entity(int argc, VALUE *argv, VALUE self)
{
  VALUE name;
  VALUE type;
  VALUE external_id;
  VALUE system_id;
  VALUE content;
  xmlDocPtr c_document;
  xmlEntityPtr c_entity;
  xmlErrorPtr c_error;
  xmlChar *c_name = NULL;
  xmlChar *c_external_id = NULL;
  xmlChar *c_system_id = NULL;
  xmlChar *c_content = NULL;
  int c_type = XML_INTERNAL_GENERAL_ENTITY;

  Data_Get_Struct(self, xmlDoc, c_document);

  rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
               &content);

  xmlResetLastError();

  if (!NIL_P(name)) {
    c_name = (xmlChar *)StringValueCStr(name);
  }
  if (!NIL_P(type)) {
    c_type = (int)NUM2INT(type);
  }
  if (!NIL_P(external_id)) {
    c_external_id = (xmlChar *)StringValueCStr(external_id);
  }
  if (!NIL_P(system_id)) {
    c_system_id = (xmlChar *)StringValueCStr(system_id);
  }
  if (!NIL_P(content)) {
    c_content = (xmlChar *)StringValueCStr(content);
  }

  c_entity = xmlAddDocEntity(c_document, c_name, c_type,
                             c_external_id, c_system_id, c_content);

  if (c_entity != NULL) {
    return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
  }

  c_error = xmlGetLastError();
  if (c_error) {
    rb_exc_raise(Nokogiri_wrap_xml_syntax_error(c_error));
  }
  rb_raise(rb_eRuntimeError, "Could not create entity");

  return Qnil;
}
create_text_node(string, &block) click to toggle source

Create a Text Node with string

# File lib/nokogiri/xml/document.rb, line 180
# Build a Nokogiri::XML::Text node owned by this document.
#
# +string+ is converted with +to_s+ and passed first, followed by the
# document; an optional block is forwarded to the constructor unchanged.
def create_text_node(string, &block)
  Nokogiri::XML::Text.new(string.to_s, self, &block)
end
decorate(node) click to toggle source

Apply any decorators to node

# File lib/nokogiri/xml/document.rb, line 282
# Extend +node+ with every registered decorator module that applies to it.
#
# @decorators maps a class to a list of modules; for each class that +node+
# is an instance of, the node is extended with all modules in that list.
# Does nothing (and returns nil) when no decorators have been registered.
def decorate(node)
  return unless @decorators

  @decorators.each do |klass, modules|
    modules.each { |mod| node.extend(mod) } if node.is_a?(klass)
  end
end
decorators(key) click to toggle source

Get the list of decorators given key

# File lib/nokogiri/xml/document.rb, line 242
# Return the mutable list of decorators registered under +key+.
#
# Both the registry Hash and the per-key Array are created on first use, so
# callers can append to the returned list directly.
def decorators(key)
  registry = (@decorators ||= {})
  registry[key] ||= []
end
document() click to toggle source

A reference to self

# File lib/nokogiri/xml/document.rb, line 200
# Return the document itself, i.e. the receiver.
def document
  self
end
dup click to toggle source

Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.

/*
 * Copy this document with xmlCopyDoc and wrap the copy in the same Ruby
 * class as the receiver.
 *
 * Takes one optional argument that is converted with NUM2INT and passed as
 * the second argument of xmlCopyDoc; it defaults to 1 when omitted.
 * Returns nil if xmlCopyDoc fails. The copy keeps the original's libxml2
 * node type and is given the very same @errors object as the receiver.
 */
static VALUE
duplicate_document(int argc, VALUE *argv, VALUE self)
{
  VALUE rb_level;
  VALUE rb_copy;
  xmlDocPtr c_original;
  xmlDocPtr c_copy;
  int given;

  given = rb_scan_args(argc, argv, "01", &rb_level);
  if (given == 0) {
    rb_level = INT2NUM((long)1);
  }

  Data_Get_Struct(self, xmlDoc, c_original);

  c_copy = xmlCopyDoc(c_original, (int)NUM2INT(rb_level));
  if (c_copy == NULL) {
    return Qnil;
  }

  c_copy->type = c_original->type;

  rb_copy = noko_xml_document_wrap(rb_obj_class(self), c_copy);
  rb_iv_set(rb_copy, "@errors", rb_iv_get(self, "@errors"));

  return rb_copy;
}
Also aliased as: clone
encoding click to toggle source

Get the encoding for this Document

/*
 * Return the document's encoding field as a Ruby String, or nil when the
 * field is not set.
 */
static VALUE
encoding(VALUE self)
{
  xmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);

  if (c_document->encoding) {
    return NOKOGIRI_STR_NEW2(c_document->encoding);
  }

  return Qnil;
}
encoding= encoding click to toggle source

Set the encoding string for this Document

/*
 * Replace the document's encoding string with a copy of the given Ruby
 * String and return the argument.
 *
 * Raises (via StringValueCStr) if encoding cannot be used as a C string;
 * in that case the document's existing encoding is left untouched.
 */
static VALUE
set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc;
  xmlChar *new_encoding;

  Data_Get_Struct(self, xmlDoc, doc);

  /*
   * Convert and duplicate first: StringValueCStr may raise, and doing this
   * before releasing the old value guarantees doc->encoding never points
   * at freed memory.
   */
  new_encoding = xmlStrdup((xmlChar *)StringValueCStr(encoding));

  if (doc->encoding) {
    /*
     * The string is replaced below with an xmlStrdup result, so it must be
     * released with libxml2's deallocator rather than the C library's
     * free(); the two differ when libxml2 uses custom memory functions.
     * The uintptr_t hop discards const without a compiler warning.
     */
    xmlFree((char *)(uintptr_t) doc->encoding);
  }

  doc->encoding = new_encoding;

  return encoding;
}
fragment(tags = nil) click to toggle source

Create a Nokogiri::XML::DocumentFragment from tags. Returns an empty fragment if tags is nil.

# File lib/nokogiri/xml/document.rb, line 301
# Build a DocumentFragment belonging to this document.
#
# +tags+ (nil by default) is handed to DocumentFragment.new together with
# the document and its current root as the third argument.
def fragment(tags = nil)
  DocumentFragment.new(self, tags, root)
end
name() click to toggle source

The name of this document. Always returns “document”

# File lib/nokogiri/xml/document.rb, line 195
# Return the node name of a document, which is always the fixed string
# 'document'.
def name
  'document'
end
namespaces() click to toggle source

Get the hash of namespaces on the root Nokogiri::XML::Node

# File lib/nokogiri/xml/document.rb, line 294
# Return the namespaces reported by the root node, or an empty Hash when
# the document has no root.
def namespaces
  return {} unless root

  root.namespaces
end
remove_namespaces! click to toggle source

Remove all namespaces from all nodes in the document.

This could be useful for developers who either don't understand namespaces or don't care about them.

The following example shows a use case, and you can decide for yourself whether this is a good thing or not:

doc = Nokogiri::XML <<-EOXML
   <root>
     <car xmlns:part="http://general-motors.com/">
       <part:tire>Michelin Model XGV</part:tire>
     </car>
     <bicycle xmlns:part="http://schwinn.com/">
       <part:tire>I'm a bicycle tire!</part:tire>
     </bicycle>
   </root>
   EOXML

doc.xpath("//tire").to_s # => ""
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>"
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>"

doc.remove_namespaces!

doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>"
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => ""
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""

For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml.html

/*
 * Strip namespaces from the whole document by handing the document node to
 * recursively_remove_namespaces_from_node. Returns the receiver.
 */
static VALUE
remove_namespaces_bang(VALUE self)
{
  xmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);
  recursively_remove_namespaces_from_node((xmlNodePtr)c_document);

  return self;
}
root click to toggle source

Get the root node for this document.

/*
 * Return the document's root element wrapped as a Ruby node, or nil when
 * xmlDocGetRootElement reports no root.
 */
static VALUE
rb_xml_document_root(VALUE self)
{
  xmlDocPtr c_document;
  xmlNodePtr c_root;

  Data_Get_Struct(self, xmlDoc, c_document);
  c_root = xmlDocGetRootElement(c_document);

  return c_root ? noko_xml_node_wrap(Qnil, c_root) : Qnil;
}
root= click to toggle source

Set the root element on this document

/*
 * Replace the document's root element and return the argument.
 *
 * rb_new_root may be nil, which leaves the document without a root, or a
 * Nokogiri::XML::Node. A node belonging to another document is deep-copied
 * into this document first. The previous root, if any, is unlinked and
 * pinned via noko_xml_document_pin_node.
 *
 * Raises ArgumentError when rb_new_root is neither nil nor a Node, and
 * RuntimeError when the node cannot be copied into this document. In both
 * failure cases the existing root is left in place.
 */
static VALUE
rb_xml_document_root_set(VALUE self, VALUE rb_new_root)
{
  xmlDocPtr c_document;
  xmlNodePtr c_new_root = NULL, c_current_root;

  Data_Get_Struct(self, xmlDoc, c_document);

  /*
   * Validate and prepare the replacement before touching the current root,
   * so that a raise below cannot leave the document rootless.
   */
  if (!NIL_P(rb_new_root)) {
    if (!rb_obj_is_kind_of(rb_new_root, cNokogiriXmlNode)) {
      rb_raise(rb_eArgError,
               "expected Nokogiri::XML::Node but received %"PRIsVALUE,
               rb_obj_class(rb_new_root));
    }

    Data_Get_Struct(rb_new_root, xmlNode, c_new_root);

    /* If the new root's document is not the same as the current document,
     * then we need to dup the node in to this document. */
    if (c_new_root->doc != c_document) {
      c_new_root = xmlDocCopyNode(c_new_root, c_document, 1);
      if (!c_new_root) {
        rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
      }
    }
  }

  /* Detach the old root and pin it rather than freeing it here. */
  c_current_root = xmlDocGetRootElement(c_document);
  if (c_current_root) {
    xmlUnlinkNode(c_current_root);
    noko_xml_document_pin_node(c_current_root);
  }

  xmlDocSetRootElement(c_document, c_new_root);

  return rb_new_root;
}
slop!() click to toggle source

Explore a document with shortcut methods. See Nokogiri::Slop for details.

Note that any nodes that have been instantiated before slop! is called will not be decorated with sloppy behavior. So, if you're in irb, the preferred idiom is:

irb> doc = Nokogiri::Slop my_markup

and not

irb> doc = Nokogiri::HTML my_markup
... followed by irb's implicit inspect (and therefore instantiation of every node) ...
irb> doc.slop!
... which does absolutely nothing.
# File lib/nokogiri/xml/document.rb, line 271
# Register the Slop decorator for XML::Node on this document.
#
# If the decorator is already registered nothing happens; otherwise it is
# appended to the XML::Node decorator list and +decorate!+ is invoked.
# Always returns the document.
def slop!
  return self if decorators(XML::Node).include?(Nokogiri::Decorators::Slop)

  decorators(XML::Node) << Nokogiri::Decorators::Slop
  decorate!

  self
end
url click to toggle source

Get the url name for this document.

/*
 * Return the document's URL field as a Ruby String, or nil when the field
 * is not set.
 */
static VALUE
url(VALUE self)
{
  xmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);

  if (!c_document->URL) {
    return Qnil;
  }

  return NOKOGIRI_STR_NEW2(c_document->URL);
}
validate() click to toggle source

Validate this Document against its DTD. Returns a list of errors on the document or nil when there is no DTD.

# File lib/nokogiri/xml/document.rb, line 250
# Validate the document against its internal subset.
#
# Returns whatever +internal_subset.validate+ returns for this document, or
# nil when the document has no internal subset.
def validate
  return internal_subset.validate(self) if internal_subset

  nil
end
version click to toggle source

Get the XML version for this Document

/*
 * Return the document's version field as a Ruby String, or nil when the
 * field is not set.
 */
static VALUE
version(VALUE self)
{
  xmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);

  if (c_document->version) {
    return NOKOGIRI_STR_NEW2(c_document->version);
  }

  return Qnil;
}

Private Instance Methods

inspect_attributes() click to toggle source
# File lib/nokogiri/xml/document.rb, line 331
# List the attribute names (as Symbols) reported for this document:
# +name+ and +children+.
def inspect_attributes
  %i[name children]
end