Index: test/test_parser.rb
===================================================================
--- test/test_parser.rb (revision 159)
+++ test/test_parser.rb (working copy)
@@ -67,8 +67,8 @@
assert_equal 'link1', (doc/:p/:a).first['id']
assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
- assert_equal (doc/'p')[2], (doc/'p').filter(':nth(2)')[0]
- assert_equal (doc/'p')[2], (doc/'p').filter('[3]')[0]
+ assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
+ assert_equal((doc/'p')[2], (doc/'p').filter('[3]')[0])
assert_equal 4, (doc/'p').filter('*').length
assert_equal 4, (doc/'p').filter('* *').length
eles = (doc/'p').filter('.ohmy')
@@ -373,7 +373,7 @@
</a>
END
doc = Hpricot::XML(chunk)
- assert (doc/"//t:sam").size > 0 # at least this should probably work
+ assert ((doc/"//t:sam").size > 0) # at least this should probably work
# assert (doc/"//sam").size > 0 # this would be nice
end
end
Index: ext/hpricot_scan/hpricot_scan.rl
===================================================================
--- ext/hpricot_scan/hpricot_scan.rl (revision 159)
+++ ext/hpricot_scan/hpricot_scan.rl (working copy)
@@ -8,19 +8,37 @@
*/
#include <ruby.h>

+/* If using Ruby earlier than 1.9.0 */
#ifndef RARRAY_LEN
-#define RARRAY_LEN(arr) RARRAY(arr)->len
-#define RSTRING_LEN(str) RSTRING(str)->len
-#define RSTRING_PTR(str) RSTRING(str)->ptr
+ #define RARRAY_LEN(ptr) RARRAY(ptr)->len
+ #define RARRAY_PTR(ptr) RARRAY(ptr)->ptr
+
+ /*
+  * Pre-1.9 struct RString exposes its length and data pointer as the plain
+  * `len` and `ptr` members. The embedded-string layout (`as.ary`/`as.heap`)
+  * belongs to 1.9, whose ruby.h already provides these macros.
+  */
+ #ifndef RSTRING_LEN
+ #define RSTRING_LEN(str) RSTRING(str)->len
+ #endif
+ #ifndef RSTRING_PTR
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
+ #endif
+ #ifndef RSTRING_END
+ #define RSTRING_END(str) (RSTRING_PTR(str)+RSTRING_LEN(str))
+ #endif
#endif

#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"

static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
sym_cdata, sym_text;
+
static VALUE rb_eHpricotParseError;
+
static ID s_read, s_to_str;

+/* Collect raw content as ruby string and yield it to block passed to Hpricot.scan */
#define ELE(N) \
if (tokend > tokstart || text == 1) { \
VALUE raw_string = Qnil; \
@@ -38,9 +56,9 @@
N = rb_str_new(mark_##N, E - mark_##N);

#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
-
#define SLIDE(N) if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart);

+/* Creates Hash of attributes from key/value pair */
#define ATTR(K, V) \
if (!NIL_P(K)) { \
if (NIL_P(attr)) attr = rb_hash_new(); \
@@ -116,6 +134,36 @@

#define BUFSIZE 16384

+/*
+ * Creates a 4-element array for a token and yields it to the block/proc given to the Hpricot.scan method in Ruby.
+ * _why explains it on his old blog, Redhanded:
+ *
+ * (1) a symbol describing the element type,
+ * (2) the tag name or text content,
+ * (3) an attributes hash,
+ * (4) the raw string which formed this token.
+ *
+ * See http://redhanded.hobix.com/inspect/okayGiveHpricot02AGo.html for more details.
+ *
+ * Element types:
+ *
+ * 1. stag — starting tag like <div>
+ * 2. etag — ending tag like </div>
+ * 3. text — text node like Hpricot is a loosy HTML parser written in Ruby and C
+ * 4. emptytag — empty tag node like <br />
+ * 5. comment — comment node like <!-- Footer -->
+ * 6. xmldecl, doctype, procins, cdata — XML declaration, DOCTYPE, processing instruction and CDATA section
+ *
+ * Examples (using some tokens from test/files/boingboing.html in Hpricot fixture files)
+ *
+ * [:stag, "a", {"href"=>"http://www.pageflakes.com/subscribe.aspx?url=http://feeds.feedburner.com/boingboing/iBag", "title"=>"Boing Boing", "type"=>"application/rss+xml"}, "<a href=\"http://www.pageflakes.com/subscribe.aspx?url=http://feeds.feedburner.com/boingboing/iBag\" title=\"Boing Boing\" type=\"application/rss+xml\">"]
+ * [:emptytag, "img", {"src"=>"http://www.boingboing.net/images/pageflakes.gif", "height"=>"17", "alt"=>"Subscribe in Pageflakes", "style"=>"border:0", "width"=>"81"}, "<img src=\"http://www.boingboing.net/images/pageflakes.gif\" alt=\"Subscribe in Pageflakes\" height=\"17\" width=\"81\" style=\"border:0\"/>"]
+ * [:etag, "a", nil, "</a>"]
+ * [:text, " on how the two of them found an entirely new song of Scott Joplin's. (its the fourth of four mini-segments.)\n\n\n", nil, " on how the two of them found an entirely new song of Scott Joplin's. (its the fourth of four mini-segments.)\n\n\n"]
+ * [:comment, " Begin: AdBrite ", nil, nil]
+ * [:cdata, " predefined content comes here ", nil, nil]
+ *
+ */
void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
{
VALUE ary;
@@ -136,11 +184,9 @@
{
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
char *tokstart = 0, *tokend = 0, *buf = NULL;
-
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
int done = 0, ele_open = 0, buffer_size = 0;
-
int taint = OBJ_TAINTED( port );
if ( !rb_respond_to( port, s_read ) )
{
@@ -154,7 +200,6 @@
rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
}
}
-
buffer_size = BUFSIZE;
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
@@ -163,21 +208,18 @@
}
}
buf = ALLOC_N(char, buffer_size);
+ %% write init;

- %% write init;
-
while ( !done ) {
VALUE str;
char *p = buf + have, *pe;
int len, space = buffer_size - have;
-
if ( space == 0 ) {
/* We've used up the entire buffer storing an already-parsed token
* prefix that must be preserved. Likely caused by super-long attributes.
* See ticket #13. */
rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
}
-
if ( rb_respond_to( port, s_read ) )
{
str = rb_funcall( port, s_read, 1, INT2FIX(space) );
@@ -186,21 +228,18 @@
{
str = rb_str_substr( port, nread, space );
}
-
StringValue(str);
memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
len = RSTRING_LEN(str);
nread += len;

- /* If this is the last buffer, tack on an EOF. */
if ( len < space ) {
p[len++] = 0;
done = 1;
}
-
pe = p + len;
%% write exec;
-
+
if ( cs == hpricot_scan_error ) {
free(buf);
if ( !NIL_P(tag) )
@@ -212,17 +251,17 @@
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
}
}
-
+
if ( done && ele_open )
{
ele_open = 0;
+
if (tokstart > 0) {
mark_tag = tokstart;
tokstart = 0;
text = 1;
}
}
-
if ( tokstart == 0 )
{
have = 0;
@@ -257,23 +296,37 @@
}
free(buf);
}
-
+/* Initializes the C extension: defines the Hpricot module, its scan method and ParseError class, and interns the IDs and token-type symbols */
void Init_hpricot_scan()
{
+ /* Define a ruby module named Hpricot */
VALUE mHpricot = rb_define_module("Hpricot");
+ /* Define a readable and writable buffer_size attribute on the module's singleton class */
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
+ /* Define the singleton method Hpricot.scan (one argument), implemented by hpricot_scan */
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
+ /* Register the Hpricot::ParseError exception class, with Exception as its parent */
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
-
+ /* Get internal VM id of read string */
s_read = rb_intern("read");
+ /* Get internal VM id of to_str string */
s_to_str = rb_intern("to_str");
+ /* Get :xmldecl symbol via VM id of string */
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
+ /* Get :doctype symbol via VM id of string */
sym_doctype = ID2SYM(rb_intern("doctype"));
+ /* Get :procins symbol via VM id of string */
sym_procins = ID2SYM(rb_intern("procins"));
+ /* Get :stag symbol via VM id of string */
sym_stag = ID2SYM(rb_intern("stag"));
+ /* Get :etag symbol via VM id of string */
sym_etag = ID2SYM(rb_intern("etag"));
+ /* Get :emptytag symbol via VM id of string */
sym_emptytag = ID2SYM(rb_intern("emptytag"));
+ /* Get :comment symbol via VM id of string */
sym_comment = ID2SYM(rb_intern("comment"));
+ /* Get :cdata symbol via VM id of string */
sym_cdata = ID2SYM(rb_intern("cdata"));
+ /* Get :text symbol via VM id of string */
sym_text = ID2SYM(rb_intern("text"));
}
Index: lib/hpricot/tags.rb
===================================================================
--- lib/hpricot/tags.rb (revision 159)
+++ lib/hpricot/tags.rb (working copy)
@@ -14,10 +14,11 @@
Attrs = AttrCore + AttrI18n + AttrEvents

# All the tags and attributes from XHTML 1.0 Strict
- class XHTMLStrict
+ class XHTMLStrict
class << self
attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
end
+
@doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
@tagset = {
:html => AttrI18n + [:id, :xmlns],
Index: lib/hpricot/parse.rb
===================================================================
--- lib/hpricot/parse.rb (revision 159)
+++ lib/hpricot/parse.rb (working copy)
@@ -1,5 +1,6 @@
require 'hpricot/htmlinfo'

+# Shortcut for Hpricot.parse
def Hpricot(input = nil, opts = {}, &blk)
Hpricot.parse(input, opts, &blk)
end
Index: lib/hpricot/tag.rb
===================================================================
--- lib/hpricot/tag.rb (revision 159)
+++ lib/hpricot/tag.rb (working copy)
@@ -21,9 +21,11 @@

class BaseEle
attr_accessor :raw_string, :parent
+
def html_quote(str)
"\"" + str.gsub('"', '\\"') + "\""
end
+
def if_output(opts)
if opts[:preserve] and not @raw_string.nil?
@raw_string
@@ -31,10 +33,15 @@
yield opts
end
end
- def pathname; self.name end
+
+ def pathname;
+ self.name
+ end
+
def altered!
@raw_string = nil
end
+
def self.alterable(*fields)
attr_accessor(*fields)
fields.each do |f|
@@ -48,14 +55,20 @@

class Elem
attr_accessor :stag, :etag, :children
+
def initialize(stag, children=nil, etag=nil)
@stag, @etag = stag, etag
@children = children ? children.each { |c| c.parent = self } : []
end
- def empty?; @children.empty? end
+
+ def empty?;
+ @children.empty?
+ end
+
[:name, :raw_attributes, :parent, :altered!].each do |m|
[m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
end
+
def attributes
if raw_attributes
raw_attributes.inject({}) do |hsh, (k, v)|
@@ -64,6 +77,7 @@
end
end
end
+
def to_plain_text
if self.name == 'br'
"\n"
@@ -77,7 +91,11 @@
super
end
end
- def pathname; self.name end
+
+ def pathname;
+ self.name
+ end
+
def output(out, opts = {})
if empty? and ElementContent[@stag.name] == :EMPTY
@stag.output(out, opts.merge(:style => :empty))
@@ -93,13 +111,17 @@
out
end
end
-
+
+ # Start tag
class STag < BaseEle
+
def initialize(name, attributes=nil)
@name = name.to_s
@raw_attributes = attributes || {}
end
+
alterable :name, :raw_attributes
+
def attributes_as_html
if @raw_attributes
@raw_attributes.map do |aname, aval|
@@ -108,6 +130,7 @@
end.join
end
end
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -117,12 +140,16 @@
end
end
end
-
+
+ # End tag
class ETag < BaseEle
+
def initialize(qualified_name)
@name = qualified_name.to_s
end
+
alterable :name
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -134,16 +161,23 @@
class BogusETag < ETag
def output(out, opts = {}); out << if_output(opts) { '' }; end
end
-
+
+ # Text node
class Text < BaseEle
def initialize(text)
@content = text
end
+
alterable :content
- def pathname; "text()" end
+
+ def pathname
+ "text()"
+ end
+
def to_s
Hpricot.uxs(@content)
end
+
alias_method :inner_text, :to_s
alias_method :to_plain_text, :to_s
def output(out, opts = {})
@@ -153,10 +187,12 @@
end
end
end
-
+
+ # CDATA section
class CData < Text
alias_method :to_s, :content
alias_method :to_plain_text, :content
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -164,13 +200,20 @@
end
end
end
-
+
+ # XML declaration
class XMLDecl < BaseEle
+
def initialize(version, encoding, standalone)
@version, @encoding, @standalone = version, encoding, standalone
end
+
alterable :version, :encoding, :standalone
- def pathname; "xmldecl()" end
+
+ def pathname
+ "xmldecl()"
+ end
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -181,13 +224,19 @@
end
end
end
-
+
+ # DOCTYPE declaration
class DocType < BaseEle
def initialize(target, pubid, sysid)
@target, @public_id, @system_id = target, pubid, sysid
end
+
alterable :target, :public_id, :system_id
- def pathname; "doctype()" end
+
+ def pathname
+ "doctype()"
+ end
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -199,11 +248,17 @@
end

class ProcIns < BaseEle
+
def initialize(target, content)
@target, @content = target, content
end
- def pathname; "procins()" end
+
+ def pathname
+ "procins()"
+ end
+
alterable :target, :content
+
def output(out, opts = {})
out <<
if_output(opts) do
@@ -218,8 +273,13 @@
def initialize(content)
@content = content
end
- def pathname; "comment()" end
+
+ def pathname
+ "comment()"
+ end
+
alterable :content
+
def output(out, opts = {})
out <<
if_output(opts) do
Index: lib/hpricot/elements.rb
===================================================================
--- lib/hpricot/elements.rb (revision 159)
+++ lib/hpricot/elements.rb (working copy)
@@ -1,54 +1,54 @@
module Hpricot
-# Once you've matched a list of elements, you will often need to handle them as
-# a group. Or you may want to perform the same action on each of them.
-# Hpricot::Elements is an extension of Ruby's array class, with some methods
-# added for altering elements contained in the array.
-#
-# If you need to create an element array from regular elements:
-#
-# Hpricot::Elements[ele1, ele2, ele3]
-#
-# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
-# Hpricot::Doc, etc.)
-#
-# == Continuing Searches
-#
-# Usually the Hpricot::Elements you're working on comes from a search you've
-# done. Well, you can continue searching the list by using the same <tt>at</tt>
-# and <tt>search</tt> methods you can use on plain elements.
-#
-# elements = doc.search("/div/p")
-# elements = elements.search("/a[@href='http://hoodwink.d/']")
-# elements = elements.at("img")
-#
-# == Altering Elements
-#
-# When you're altering elements in the list, your changes will be reflected in
-# the document you started searching from.
-#
-# doc = Hpricot("That's my <b>spoon</b>, Tyler.")
-# doc.at("b").swap("<i>fork</i>")
-# doc.to_html
-# #=> "That's my <i>fork</i>, Tyler."
-#
-# == Getting More Detailed
-#
-# If you can't find a method here that does what you need, you may need to
-# loop through the elements and find a method in Hpricot::Container::Trav
-# which can do what you need.
-#
-# For example, you may want to search for all the H3 header tags in a document
-# and grab all the tags underneath the header, but not inside the header.
-# A good method for this is <tt>next_sibling</tt>:
-#
-# doc.search("h3").each do |h3|
-# while ele = h3.next_sibling
-# ary << ele # stuff away all the elements under the h3
-# end
-# end
-#
-# Most of the useful element methods are in the mixins Hpricot::Traverse
-# and Hpricot::Container::Trav.
+ # Once you've matched a list of elements, you will often need to handle them as
+ # a group. Or you may want to perform the same action on each of them.
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
+ # added for altering elements contained in the array.
+ #
+ # If you need to create an element array from regular elements:
+ #
+ # Hpricot::Elements[ele1, ele2, ele3]
+ #
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
+ # Hpricot::Doc, etc.)
+ #
+ # == Continuing Searches
+ #
+ # Usually the Hpricot::Elements you're working on comes from a search you've
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
+ # and <tt>search</tt> methods you can use on plain elements.
+ #
+ # elements = doc.search("/div/p")
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
+ # elements = elements.at("img")
+ #
+ # == Altering Elements
+ #
+ # When you're altering elements in the list, your changes will be reflected in
+ # the document you started searching from.
+ #
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
+ # doc.at("b").swap("<i>fork</i>")
+ # doc.to_html
+ # #=> "That's my <i>fork</i>, Tyler."
+ #
+ # == Getting More Detailed
+ #
+ # If you can't find a method here that does what you need, you may need to
+ # loop through the elements and find a method in Hpricot::Container::Trav
+ # which can do what you need.
+ #
+ # For example, you may want to search for all the H3 header tags in a document
+ # and grab all the tags underneath the header, but not inside the header.
+ # A good method for this is <tt>next_sibling</tt>:
+ #
+ # doc.search("h3").each do |h3|
+ # while ele = h3.next_sibling
+ # ary << ele # stuff away all the elements under the h3
+ # end
+ # end
+ #
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
+ # and Hpricot::Container::Trav.
class Elements < Array

# Searches this list for any elements (or children of these elements) matching
@@ -198,7 +198,7 @@
#
# This example adds a <tt>#top</tt> anchor to each link.
#
- def attr key, value = nil, &blk
+ def attr(key, value = nil, &blk)
if value or blk
each do |el|
el.set_attribute(key, value || blk[el])
@@ -219,7 +219,7 @@
# (doc/"p").add_class("bacon")
#
# Now all paragraphs will have class="bacon".
- def add_class class_name
+ def add_class(class_name)
each do |el|
next unless el.respond_to? :get_attribute
classes = el.get_attribute('class').to_s.split(" ")
@@ -232,7 +232,7 @@
#
# (doc/"input").remove_attr("disabled")
#
- def remove_attr name
+ def remove_attr(name)
each do |el|
next unless el.respond_to? :remove_attribute
el.remove_attribute(name)
@@ -240,15 +240,15 @@
self
end

- # Removes a class from all matched elements.
+ # Removes a CSS class from all matched elements.
#
# (doc/"span").remove_class("lightgrey")
#
- # Or, to remove all classes:
+ # Or, to remove all CSS classes:
#
# (doc/"span").remove_class
#
- def remove_class name = nil
+ def remove_class(name = nil)
each do |el|
next unless el.respond_to? :get_attribute
if name
@@ -260,55 +260,114 @@
end
self
end
-
+ alias_method :remove_css_class, :remove_class
+
+ # Regexp to parse attribute selectors like li[@class='search_item']
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i
+ # [ ]
BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
+ # Function like :not(...) or :first(...) or :last(...)
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
+ # Custom function
CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!

def self.filter(nodes, expr, truth = true)
- until expr.empty?
- _, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
- break unless _
-
- expr = $'
- m.compact!
- if m[0] == '@'
- m[0] = "@#{m.slice!(2,1)}"
+ until expr.empty?
+ # Take groups out of match
+ #
+ # Continuing with example of li[@class='search_item'] m will contain the following:
+ #
+ # ["@", "class", nil, "=", "search_item"]
+ #
+ match, *match_groups = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
+
+ # Break if no match happened at all
+ break unless match
+
+ # Post match
+ expr = $'
+ # Remove nils
+ match_groups.compact!
+
+ origin = match_groups.dup
+
+ # If dealing with attribute selector
+ if match_groups[0] == '@'
+ # In example above ["@", "class", "=", "search_item"] becomes ["@=", "class", "search_item"]
+ # Array#to_s behaviour changed in Ruby 1.9.0
+ match_groups[0] = "@#{[match_groups.slice!(2, 1)].flatten.first}"
+ end
+
+ step_one = match_groups.dup
+
+ # Argument looks like this: ':nth(2)'
+ # Or like this: '[3]'
+ if match_groups[0] == '[' && match_groups[1] =~ /^\d+$/
+ match_groups = [":", "nth", match_groups[1].to_i-1]
+ end
+
+ # :not function
+ # argument looks like this: 'p:not(.ohmy)'
+ if match_groups[0] == ":" && match_groups[1] == "not"
+ nodes, = Elements.filter(nodes, match_groups[2], false)
+ # :even or :odd functions
+ # argument looks like this: 'p:even' or 'p:odd'
+ elsif "#{match_groups[0]}#{match_groups[1]}" =~ /^(:even|:odd)$/
+ nodes = select_even_or_idd_nodes(nodes, match_groups[1])
+
+ # :first or :last functions
+ # argument looks like this: 'p:last' or 'p:first'
+ # just take first or last element from nodes array
+ elsif "#{match_groups[0]}#{match_groups[1]}" =~ /^(:first|:last)$/
+ nodes = [nodes.send(match_groups[1])]
+ else
+ meth = "filter[#{match_groups[0]}#{match_groups[1]}]" unless match_groups[0].empty?
+ #puts "origin: #{origin.inspect} => #{step_one} => #{match_groups[0]} and #{meth}" if ENV['TRACE']
+
+ # If traversing method we are going to use is known
+ if meth and Traverse.method_defined?(meth)
+ args = match_groups[2..-1]
+ else
+ # filter[@=] or filter[text()~=] or things like that
+ meth = "filter[#{match_groups[0]}]"
+ # If traversing method we are going to use is known
+ if Traverse.method_defined? meth
+ args = match_groups[1..-1]
+ # puts "Traverse method defined, args: #{args}"
end
-
- if m[0] == '[' && m[1] =~ /^\d+$/
- m = [":", "nth", m[1].to_i-1]
- end
-
- if m[0] == ":" && m[1] == "not"
- nodes, = Elements.filter(nodes, m[2], false)
- elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
- new_nodes = []
- nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) }
- nodes = new_nodes
- elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
- nodes = [nodes.send(m[1])]
- else
- meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
- if meth and Traverse.method_defined? meth
- args = m[2..-1]
- else
- meth = "filter[#{m[0]}]"
- if Traverse.method_defined? meth
- args = m[1..-1]
- end
- end
- i = -1
- nodes = Elements[*nodes.find_all do |x|
- i += 1
- x.send(meth, *([*args] + [i])) ? truth : !truth
- end]
- end
+ end
+
+ i = -1
+ nodes = Elements[*nodes.find_all do |node|
+ i += 1
+ # like p, 0
+ # or alt, 200606131240, 68
+ filtering_args = ([*args] + [i])
+
+ begin
+ res = node.send(meth, *filtering_args)
+ rescue Exception => e
+ puts "DEBUG => Node: #{node}"
+ puts "DEBUG => Groups:#{match_groups.join(" ")}"
+ puts("DEBUG => Arguments to send: " + filtering_args.join(" "))
+ puts "DEBUG => Method: #{meth}"
+ puts "DEBUG => Result of send: #{res}\n\n"
+ end
+
+ res ? truth : !truth
+ end]
end
- [nodes, expr]
+ end
+ [nodes, expr]
end
+
+ def self.select_even_or_odd_nodes(nodes, oddity = :odd)
+ result = []
+ nodes.each_with_index { |node, i| result.push(node) if (i % 2 == (oddity.to_sym == :even ? 0 : 1)) }
+ result
+ end
+

# Given two elements, attempt to gather an Elements array of everything between
# (and including) those two elements.
@@ -345,164 +404,242 @@
end

def filter(expr)
- nodes, = Elements.filter(self, expr)
- nodes
+ nodes, = Elements.filter(self, expr)
+ nodes
end

def not(expr)
- if expr.is_a? Traverse
- nodes = self - [expr]
- else
- nodes, = Elements.filter(self, expr, false)
- end
- nodes
+ if expr.is_a? Traverse
+ nodes = self - [expr]
+ else
+ nodes, = Elements.filter(self, expr, false)
+ end
+ nodes
end

private
def copy_node(node, l)
- l.instance_variables.each do |iv|
- node.instance_variable_set(iv, l.instance_variable_get(iv))
- end
+ l.instance_variables.each do |iv|
+ node.instance_variable_set(iv, l.instance_variable_get(iv))
+ end
end

end

module Traverse
- def self.filter(tok, &blk)
- define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
+ # Defines the "filter[...]" methods that do the actual filtering. Elements.filter looks them up by name and sends them to each node.
+ def self.has_elements_filter(token, &blk)
+ # Possible methods to be defined:
+ #
+ # filter[]
+ # filter[#]
+ # filter[.]
+ # filter[:lt]
+ # filter[:gt]
+ # filter[:nth]
+ # filter[:eq]
+ # filter[:nth-of-type]
+ # filter[:first]
+ # filter[:first-of-type]
+ # filter[:last]
+ # filter[:last-of-type]
+ # filter[:even]
+ # filter[:odd]
+ # filter[:first-child]
+ # filter[:nth-child]
+ # filter[:last-child]
+ # filter[:nth-last-of-type]
+ # filter[:nth-last-child]
+ # filter[:only-of-type]
+ # filter[:only-child]
+ # filter[:parent]
+ # filter[:root]
+ # filter[:empty]
+ # filter[text]
+ # filter[comment]
+ # filter[:contains]
+ #
+ # The following filters operate on attributes:
+ #
+ # filter[@=]
+ # filter[@!=]
+ # filter[@~=]
+ # filter[@|=]
+ # filter[@^=]
+ # filter[@$=]
+ # filter[@*=]
+ #
+ # These operate on text:
+ #
+ # filter[text()=]
+ # filter[text()!=]
+ # filter[text()~=]
+ # filter[text()|=]
+ # filter[text()^=]
+ # filter[text()$=]
+ # filter[text()*=]
+ define_method("filter[#{stringify_filter_token(token)}]", &blk)
end
-
- filter '' do |name,i|
+
+ def self.stringify_filter_token(token)
+ token.is_a?(String) ? token : token.inspect
+ end
+
+ # get elements by tag name (case insensitive)
+ has_elements_filter '' do |name, node_position|
name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
end
-
- filter '#' do |id,i|
- self.elem? and get_attribute('id').to_s == id
+
+ # get element by element id
+ has_elements_filter '#' do |id, i|
+ self.elem? && (get_attribute('id').to_s == id)
end
-
- filter '.' do |name,i|
- self.elem? and classes.include? name
+
+ # get elements with given class
+ has_elements_filter '.' do |css_class, node_position|
+ self.elem? && classes.include?(css_class)
end
-
- filter :lt do |num,i|
- self.position < num.to_i
+
+ # get elements whose position is lower than the given number
+ has_elements_filter :lt do |node_position, _|
+ self.position < node_position.to_i
end

- filter :gt do |num,i|
- self.position > num.to_i
+ has_elements_filter :gt do |node_position, _|
+ self.position > node_position.to_i
end

- nth = proc { |num,i| self.position == num.to_i }
- nth_first = proc { |*a| self.position == 0 }
- nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
-
- filter :nth, &nth
- filter :eq, &nth
- filter ":nth-of-type", &nth
-
- filter :first, &nth_first
- filter ":first-of-type", &nth_first
-
- filter :last, &nth_last
- filter ":last-of-type", &nth_last
-
- filter :even do |num,i|
- self.position % 2 == 0
- end
-
- filter :odd do |num,i|
- self.position % 2 == 1
- end
-
- filter ':first-child' do |i|
+ nth = proc { |node_position, _| self.position == node_position.to_i }
+ nth_first = proc { |*arguments_we_dont_care_about_here| self.position == 0 }
+ nth_last = proc { |*arguments_we_dont_care_about_here| self == parent.children_of_type(self.name).last }
+ evens = proc { |*arguments_we_dont_care_about_here| self.position % 2 == 0 }
+ odds = proc { |*arguments_we_dont_care_about_here| self.position % 2 == 1 }
+
+ # get nth element in collection
+ has_elements_filter :nth, &nth
+ has_elements_filter :eq, &nth
+ has_elements_filter ":nth-of-type", &nth
+
+ # get first element in collection
+ has_elements_filter :first, &nth_first
+ has_elements_filter ":first-of-type", &nth_first
+
+ # get last element in collection
+ has_elements_filter :last, &nth_last
+ has_elements_filter ":last-of-type", &nth_last
+
+ # get only even elements
+ has_elements_filter :even, &evens
+ # get only odd elements
+ has_elements_filter :odd, &odds
+
+ # get only elements that are first children of respective parents
+ has_elements_filter ':first-child' do
self == parent.containers.first
end
-
- filter ':nth-child' do |arg,i|
- case arg
- when 'even'; (parent.containers.index(self) + 1) % 2 == 0
- when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
- else self == (parent.containers[arg.to_i + 1])
- end
+
+ # get only elements that are nth children of respective parents
+ has_elements_filter ':nth-child' do |node_position, i|
+ self == (parent.containers[node_position.to_i + 1])
end
-
- filter ":last-child" do |i|
+
+ # get only elements that are last children of respective parents
+ has_elements_filter ":last-child" do
self == parent.containers.last
end

- filter ":nth-last-child" do |arg,i|
- self == parent.containers[-1-arg.to_i]
+ # get only elements that are nth last (like nth from the opposite end) children of respective parents
+ has_elements_filter ":nth-last-child" do |node_position, _|
+ self == parent.containers[-1 - node_position.to_i]
end
-
- filter ":nth-last-of-type" do |arg,i|
- self == parent.children_of_type(self.name)[-1-arg.to_i]
+
+ # get only elements that are nth last (like nth from the opposite end) children of given type of respective parents
+ has_elements_filter ":nth-last-of-type" do |node_position, _|
+ self == parent.children_of_type(self.name)[-1 - node_position.to_i]
end
-
- filter ":only-of-type" do |arg,i|
+
+ # get only elements that are of given type
+ has_elements_filter ":only-of-type" do
parent.children_of_type(self.name).length == 1
end
-
- filter ":only-child" do |arg,i|
+
+ # get only elements that are the sole container child of their parent
+ has_elements_filter ":only-child" do |*ignore|
parent.containers.length == 1
end
-
- filter :parent do
+
+ has_elements_filter ":only-children" do |*ignore|
+ parent.containers.length == 1
+ end
+
+ # get only parent elements
+ has_elements_filter :parent do |*ignore|
containers.length > 0
end
-
- filter :empty do
+
+ # get only empty elements (without children at all)
+ has_elements_filter :empty do |*ignore|
containers.length == 0
end
-
- filter :root do
+
+ # get root element
+ has_elements_filter :root do |*ignore|
self.is_a? Hpricot::Doc
end

- filter 'text' do
+ # get text elements
+ has_elements_filter 'text' do |*ignore|
self.text?
end
-
- filter 'comment' do
+
+ # get comment elements
+ has_elements_filter 'comment' do |*ignore|
self.comment?
end
-
- filter :contains do |arg, ignore|
+
+ # get elements whose HTML contains the given string
+ has_elements_filter :contains do |arg, ignore|
html.include? arg
end
+
+ predicate_procs = {
+ 'text()' => proc { |ele, *_| ele.inner_text.strip },
+ '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }
+ }

+ operations_procs = {
+ '=' => proc { |a,b| a == b },
+ '!=' => proc { |a,b| a != b },
+ '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
+ '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
+ '^=' => proc { |a,b| a.index(b) == 0 },
+ '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
+ '*=' => proc { |a,b| idx = a.index(b) }
+ }

-
- pred_procs =
- {'text()' => proc { |ele, *_| ele.inner_text.strip },
- '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
-
- oper_procs =
- {'=' => proc { |a,b| a == b },
- '!=' => proc { |a,b| a != b },
- '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
- '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
- '^=' => proc { |a,b| a.index(b) == 0 },
- '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
- '*=' => proc { |a,b| idx = a.index(b) }}
-
- pred_procs.each do |pred_n, pred_f|
- oper_procs.each do |oper_n, oper_f|
- filter "#{pred_n}#{oper_n}" do |*a|
- qual = pred_f[self, *a]
- oper_f[qual, a[-2]] if qual
+ # construct filters like @=, @!=, text()~=, text()*= and so forth
+ predicate_procs.each do |predicate_name, predicate_function|
+ operations_procs.each do |operation_name, operation_function|
+
+ has_elements_filter "#{predicate_name}#{operation_name}" do |*a|
+ qual = predicate_function[self, *a]
+ operation_function[qual, a[-2]] if qual
end
+
end
end
-
- filter 'text()' do |val,i|
+
+ # get elements containing text
+ has_elements_filter 'text()' do |val, i|
!self.inner_text.strip.empty?
end
-
- filter '@' do |attr,val,i|
+
+ # get elements that have given attribute
+ has_elements_filter '@' do |attr, val, i|
self.elem? and has_attribute? attr
end
-
- filter '[' do |val,i|
+
+ has_elements_filter '[' do |val, i|
self.elem? and search(val).length > 0
end