Report abuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
require 'nokogiri'

# Wraps every heading (h1-h6) of an HTML document, together with the content
# that follows it, in a <section> element and prints the resulting markup.
#
# Usage: ruby SCRIPT [input.html]   (the input path defaults to test.html)

# Matches a heading element name and captures its level digit.
HEADING_NAME = /\Ah(\d)/

# Returns true when +node+ is a heading whose level number is less than or
# equal to +level+; such a heading ends the section opened at +level+.
def closes_section?(node, level)
  match = HEADING_NAME.match(node.name)
  match ? match[1].to_i <= level : false
end

# Collects the siblings that follow +header+, stopping before the first
# heading that closes a section of the given +level+ (or at the last sibling).
def section_content(header, level)
  members = []
  node = header.next_sibling
  while node && !closes_section?(node, level)
    members << node
    node = node.next_sibling
  end
  members
end

# Derives an id from heading text: lower-cased, with every run of characters
# other than word characters and hyphens replaced by a single hyphen, and
# hyphens trimmed from BOTH ends. The result may be an empty string.
def section_id(text)
  text.downcase.gsub(/[^\w-]+/, '-').gsub(/\A-+|-+\z/, '')
end

# Parse
path = ARGV.first || 'test.html'
doc = Nokogiri::HTML(File.read(path))

# Add sections for all headers. Levels are handled from h1 down to h6, so by
# the time a deeper heading is processed it already sits inside the section of
# the shallower heading above it, and its siblings are limited to that section.
(1..6).each do |level|
  # Search from the document rather than doc.root, so an input without a root
  # element does not end in a call on nil.
  doc.css("h#{level}").each do |header|
    # Gather the section's content before the tree is modified.
    content = section_content(header, level)

    # Create the section where the header currently is.
    section = Nokogiri::XML::Node.new('section', doc)
    id = section_id(header.content)
    # An empty id attribute is not valid HTML, so leave it off in that case.
    section['id'] = id unless id.empty?
    header.add_previous_sibling(section)

    # Move the header and its content into the section, keeping their order.
    [header, *content].each { |node| section.add_child(node.unlink) }
  end
end

# Done
puts doc.to_s.gsub(' />', '>')