I wrote this to test some things with mechanize and BeautifulSoup. Both are fantastic libs, btw.
If Google asks for a captcha, the class calls the specified callback function. In my test case I just save the captcha as a file and the user has to enter the captcha code — then the program continues.
Here's the code:
import re from BeautifulSoup import BeautifulSoup from mechanize import Browser, HTTPError, urljoin class GoogleCrawler(object): def __init__(self): br = Browser() br.set_handle_redirect(True) br.set_handle_equiv(True) br.set_handle_referer(True) br.set_handle_robots(False) br.addheaders = \ [("User-agent", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13")] self.br = br self.url_regex = "^http[s]*\:\/\/(?![\w\.]+\.google)(?!www\.youtube).+" self.urls = [] self.file_link_pairs = [] br.open("http://www.google.com/") def _build_absolute_path(self, url, root): return urljoin(root, url) def _get_links(self, text, filetype): soup = BeautifulSoup(text) return [(tag["href"], tag.text) for tag in soup.findAll(href=re.compile(".+\.%s$" % filetype))] def _next_page(self, page): try: self.br.follow_link(text=page) except Exception, e: print e.message def search(self, terms, filetype, captcha_callback, google_site_count=1, strict=False, url_callback=None, file_callback=None): self.urls = [] self.file_link_pairs = [] if strict: search_string = "intitle:\"index of\" +intext:\"parent directory\" +intext:\"%s\"" % terms else: google_terms = " +".join(["intext:\"%s\"" % t for t in terms.split()]) search_string = "intitle:\"index of\" +intext:\"parent directory\" +" + google_terms self.br.select_form(nr=0) self.br["q"] = search_string try: self.br.submit() except HTTPError: # captcha requested soup = BeautifulSoup(self.br.response().read()) img_url = soup.find("img")["src"] img = self.br.open_novisit(img_url).read() captcha_code = captcha_callback(img) self.br.select_form(nr=0) self.br["captcha"] = captcha_code try: self.br.submit() except Exception, e: print e.message return for page in range(2, 2 + google_site_count): # fetch urls that may contain some files urls = [u.url for u in self.br.links(url_regex=self.url_regex)] for u in urls: self.urls.append(u) if url_callback: url_callback(u) # iterate 
through urls for fetching the data for url in urls: try: content = Browser().open(url).read() except Exception, e: # site not available print e.message continue # site available, get pairs from site if content: pairs = self._get_links(content, filetype) if not pairs: continue for href, text in pairs: if not href.startswith("http"): # no absolute url href = self._build_absolute_path(href, url) self.file_link_pairs.append((href, text)) if file_callback: file_callback(href, text) self._next_page(str(page)) # go to next google page
def _captcha_callback(img): with open("captcha.jpg", "wb") as f: f.write(img) code = raw_input("Captcha written to captcha.jpg. Code: ") return code counter = 1 def _file_callback(link, name): global counter print "%d. %s: %s" % (counter, name, link) counter += 1 c = GoogleCrawler() c.search("instruction", "pdf", google_site_count=2, captcha_callback=_captcha_callback, file_callback=_file_callback)
Any suggestions or criticism?