Not all of the markup in the HTML page is needed, so we first need to remove the unnecessary elements. The following is based on the markup used in Informl.
# Function: scrape_page.rb
def scrape_the_page(pagePath,oFile,hFile)
items_to_remove = [
"#menus", #menus notice
"div.markedup",
"div.navigation",
"head", #table of contents
"hr"
]
doc=Hpricot(open(pagePath))
@article = (doc/"#container").each do |content|
#remove unnecessary content and edit links
items_to_remove.each { |x| (content/x).remove }
end