I wrote this to test some things with mechanize and BeautifulSoup. Both are fantastic libs, btw.
If Google asks for a captcha, the class calls the specified callback function. In my test case I just save the captcha as a file and the user has to enter the captcha code — then the program continues.
Here's the code:
import re from BeautifulSoup import BeautifulSoup from mechanize import Browser, HTTPError, urljoin class GoogleCrawler(object): def __init__(self): br = Browser() br.set_handle_redirect(True) br.set_handle_equiv(True) br.set_handle_referer(True) br.set_handle_robots(False) br.addheaders = \ [("User-agent", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13")] self.br = br self.url_regex = "^http[s]*\:\/\/(?![\w\.]+\.google)(?!www\.youtube).+" self.urls = [] self.file_link_pairs = [] br.open("http://www.google.com/") def _build_absolute_path(self, url, root): return urljoin(root, url) def _get_links(self, text, filetype): soup = BeautifulSoup(text) return [(tag["href"], tag.text) for tag in soup.findAll(href=re.compile(".+\.%s$" % filetype))] def _next_page(self, page): try: self.br.follow_link(text=page) except Exception, e: print e.message def search(self, terms, filetype, captcha_callback, google_site_count=1, strict=False, url_callback=None, file_callback=None): self.urls = [] self.file_link_pairs = [] if strict: search_string = "intitle:\"index of\" +intext:\"parent directory\" +intext:\"%s\"" % terms else: google_terms = " +".join(["intext:\"%s\"" % t for t in terms.split()]) search_string = "intitle:\"index of\" +intext:\"parent directory\" +" + google_terms self.br.select_form(nr=0) self.br["q"] = search_string try: self.br.submit() except HTTPError: # captcha requested soup = BeautifulSoup(self.br.response().read()) img_url = soup.find("img")["src"] img = self.br.open_novisit(img_url).read() captcha_code = captcha_callback(img) self.br.select_form(nr=0) self.br["captcha"] = captcha_code try: self.br.submit() except Exception, e: print e.message return for page in range(2, 2 + google_site_count): # fetch urls that may contain some files urls = [u.url for u in self.br.links(url_regex=self.url_regex)] for u in urls: self.urls.append(u) if url_callback: url_callback(u) # iterate 
through urls for fetching the data for url in urls: try: content = Browser().open(url).read() except Exception, e: # site not available print e.message continue # site available, get pairs from site if content: pairs = self._get_links(content, filetype) if not pairs: continue for href, text in pairs: if not href.startswith("http"): # no absolute url href = self._build_absolute_path(href, url) self.file_link_pairs.append((href, text)) if file_callback: file_callback(href, text) self._next_page(str(page)) # go to next google page
def _captcha_callback(img): with open("captcha.jpg", "wb") as f: f.write(img) code = raw_input("Captcha written to captcha.jpg. Code: ") return code counter = 1 def _file_callback(link, name): global counter print "%d. %s: %s" % (counter, name, link) counter += 1 c = GoogleCrawler() c.search("instruction", "pdf", google_site_count=2, captcha_callback=_captcha_callback, file_callback=_file_callback)
Any suggestions or criticism?