import mechanicalsoup
import time
import re
def stripGoogleHeader(soup):
    """Remove the Google cache banner from a parsed page, in place.

    Looks up the element with id ``google-cache-hdr`` and, if it exists,
    removes it from the tree via ``decompose()``.

    Args:
        soup: Parsed document exposing ``find(id=...)`` (e.g. the object
            returned by ``browser.get_current_page()``).

    Returns:
        The same ``soup`` object, so the call can be used inline.
    """
    header = soup.find(id="google-cache-hdr")
    # find() returns None when the banner is absent; leave the page untouched
    # in that case instead of failing with AttributeError.
    if header is not None:
        header.decompose()
    return soup
# Script body: download every page of one forum thread from the Google cache
# and save each page as "<threadname>-<page>.html" in the working directory.

# Requests are sent with a desktop Chrome user-agent string.
browser = mechanicalsoup.StatefulBrowser(user_agent="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36")
# Edit this to save a different thread
threadname = "3199-recoveries"
# URL of the first page of the thread.
url = "http://webcache.googleusercontent.com/search?q=cache:http://www.swolesource.com/forum/post-finasteride-syndrome/{0}.html".format(threadname)
# URL template for pages 2..N; the remaining "{0}" is filled with the page number.
baseurl = "http://webcache.googleusercontent.com/search?q=cache:http://www.swolesource.com/forum/post-finasteride-syndrome/{0}".format(threadname) + "-{0}.html"
print("Scraping page 1: {0}".format(url))
browser.open(url)
# Figure out the number of pages from the "Page X of Y" link.
# Raw string avoids the invalid "\d" escape; "\d+" lets threads with ten or
# more pages report their full page count instead of only the first digit.
rgx = re.compile(r"Page \d+ of (?P<pageno>\d+)")
pagelink = browser.get_current_page().find("a", string=rgx)
if pagelink is None:
    # No "Page X of Y" link was found; treat the thread as a single page.
    amtpages = 1
else:
    amtpages = int(rgx.search(pagelink.string).group("pageno"))
print("Total of {0} pages found".format(amtpages))
# Save first page
# Explicit UTF-8 so non-ASCII post text does not depend on the platform's
# default encoding.
with open('{0}-{1}.html'.format(threadname, 1), 'w', encoding="utf-8") as outfile:
    outfile.write(str(stripGoogleHeader(browser.get_current_page())))
# Save the rest of the pages
for i in range(2, amtpages + 1):
    # Wait between requests (presumably to avoid being rate-limited -- confirm).
    time.sleep(15)
    pageurl = baseurl.format(i)
    print("Scraping page {0}: {1}".format(i, pageurl))
    browser.open(pageurl)
    with open('{0}-{1}.html'.format(threadname, i), 'w', encoding="utf-8") as outfile:
        outfile.write(str(stripGoogleHeader(browser.get_current_page())))