google-site-verification: googlebaca44933768a824.html [Python 2.7] Google file crawler - Old Royal Hack Forum

Announcement

Collapse
No announcement yet.

[Python 2.7] Google file crawler

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

    [Python 2.7] Google file crawler

    Here's a class I coded for searching Google for files like MP3s, PDFs, etc.
    I wrote this to test some things with mechanize and BeautifulSoup. Both are fantastic libs, by the way.

    If Google asks for a captcha, the class calls the specified callback function. In my test case I just save the captcha as a file, and the user has to enter the captcha code — then the program continues.

    Here's the code:
    Code:
    import re
    
    from BeautifulSoup import BeautifulSoup
    from mechanize import Browser, HTTPError, urljoin
    
    class GoogleCrawler(object):
        def __init__(self):
            br = Browser()
            br.set_handle_redirect(True)
            br.set_handle_equiv(True)
            br.set_handle_referer(True)
            br.set_handle_robots(False)
            br.addheaders = \
            [("User-agent", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13")]
            self.br = br
            self.url_regex = "^http[s]*\:\/\/(?![\w\.]+\.google)(?!www\.youtube).+"
            self.urls = []
            self.file_link_pairs = []
            br.open("http://www.google.com/")
    
        def _build_absolute_path(self, url, root):
            return urljoin(root, url)
    
        def _get_links(self, text, filetype):
            soup = BeautifulSoup(text)
            return [(tag["href"], tag.text) for tag in soup.findAll(href=re.compile(".+\.%s$" % filetype))]
    
        def _next_page(self, page):
            try:
                self.br.follow_link(text=page)
            except Exception, e:
                print e.message
    
        def search(self, terms, filetype, captcha_callback, google_site_count=1, strict=False, url_callback=None, file_callback=None):
            self.urls = []
            self.file_link_pairs = []
    
            if strict:
                search_string = "intitle:\"index of\" +intext:\"parent directory\" +intext:\"%s\"" % terms
            else:
                google_terms = " +".join(["intext:\"%s\"" % t for t in terms.split()])
                search_string = "intitle:\"index of\" +intext:\"parent directory\" +" + google_terms
            self.br.select_form(nr=0)
            self.br["q"] = search_string
    
            try:
                self.br.submit()
            except HTTPError: # captcha requested
                soup = BeautifulSoup(self.br.response().read())
                img_url = soup.find("img")["src"]
                img = self.br.open_novisit(img_url).read()
    
                captcha_code = captcha_callback(img)
                self.br.select_form(nr=0)
                self.br["captcha"] = captcha_code
                try:
                    self.br.submit()
                except Exception, e:
                    print e.message
                    return
    
            for page in range(2, 2 + google_site_count):
                # fetch urls that may contain some files
                urls = [u.url for u in self.br.links(url_regex=self.url_regex)]
                for u in urls:
                    self.urls.append(u)
                    if url_callback:
                        url_callback(u)
    
                # iterate through urls for fetching the data
                for url in urls:
                    try:
                        content = Browser().open(url).read()
                    except Exception, e: # site not available
                        print e.message
                        continue
    
                    # site available, get pairs from site
                    if content:
                        pairs = self._get_links(content, filetype)
                        if not pairs:
                            continue
                        for href, text in pairs:
                            if not href.startswith("http"): # no absolute url
                                href = self._build_absolute_path(href, url)
    
                            self.file_link_pairs.append((href, text))
                            if file_callback:
                                file_callback(href, text)
                self._next_page(str(page)) # go to next google page
    Here's an example that prints all the results:
    Code:
    def _captcha_callback(img):
        """Dump the captcha image to disk and prompt the user for the code."""
        out = open("captcha.jpg", "wb")
        try:
            out.write(img)
        finally:
            out.close()
        return raw_input("Captcha written to captcha.jpg. Code: ")
    
    counter = 1
    def _file_callback(link, name):
        global counter
        print "%d. %s: %s" % (counter, name, link)
        counter += 1
    
    # Walk two Google result pages for "index of" listings mentioning
    # "instruction", printing every PDF link found; pauses for manual
    # captcha entry if Google challenges the crawler.
    c = GoogleCrawler()
    c.search("instruction", "pdf", google_site_count=2, captcha_callback=_captcha_callback, file_callback=_file_callback)
    I think I'm going to create a GUI with PyQt and add some methods for downloading the files.


    Any suggestions or criticism?
    Foobar
Working...
X