Examples

I invite you to check the list of examples in the repository, which gives you real-world examples of spiders used to scrape data from websites.

Here is an example of a spider for the StackOverflow website, which is run like this:

$ python stackoverflow.py --config=config.json

# -*- encoding: utf8 -*-
"""Crawler for Stackoverflow website.

http://stackoverflow.com/questions/tagged/python

The Stackoverflow questions page is a perfect example of how most websites
are structured:

    * A **list** page (or pages) that contains a summary of all items.
    * The list pages are divided into multiple pages using **pagination**.
    * Each item in the list pages contains a link to a **detail** page that
      holds extra data.

Using crawly you can structure your crawler in the following form:

    1) A ``WebSite`` subclass that defines the front structure of the
       website, which is: Pagination, ListPage.
    2) The ListPage is a subclass of ``WebPage`` which defines links
       ``tofollow`` to reach the DetailPage and data ``toextract`` from the
       list page.
    3) The DetailPage is the final page which contains the data
       ``toextract``.

"""
import sys; sys.path.append('..')  # Include crawly in system path.

import json

import requests
from gevent.lock import Semaphore  # ``gevent.coros`` in older gevent releases.

from crawly import XPath, HTML, WebSite, WebPage, Pagination, runner

# If this were a real crawler for StackOverflow it would be better to
# change the ``pagesize`` parameter in the URL to 50 (the maximum) to
# make as few requests as possible.
URL = "http://stackoverflow.com/questions/tagged/python?&sort=newest&pagesize=15"
FILENAME = 'questions.json'  # File used to dump crawled pages.
NEW_DATA = {}                # Holds newly extracted data.
LOCK = Semaphore()           # Synchronizes writes to NEW_DATA.
try:
    data = json.load(open(FILENAME))
except IOError:              # First time run, file doesn't exist yet.
    CRAWLED = set()
else:
    CRAWLED = set(data)      # Set of already crawled URLs.
    del data


def _get_end():
    "Get last page to crawl."
    response = requests.get(URL)
    last = int(
        HTML(response.content).extract(
            '//span[@class="page-numbers"]/text()'
        )[-1]  # -1 for last page.
    )
    runner.log('Number of pages detected is: %d' % last)
    return 1  # XXX: You can ``return last`` to crawl the whole website.


class QuestionPage(WebPage):

    # All the data extracted here could also be extracted from the list
    # page, but doing it here shows how you can use crawly to follow links
    # and extract data from the detail pages.
    toextract = {
        'title': '//div[@id="question-header"]/h1/a/text()',
        'user_name': '//div[@id="question"]//div[@class="user-details"]/a/text()',
        'datetime': '//div[@id="question"]//div[@class="user-action-time"]/span/@title',
        'tags': '//div[@id="question"]//a[@class="post-tag"]/text()',
        'accepted': XPath(
            '//span[starts-with(@class, "vote-accepted-on")]',
            bool
        )
    }


class ListPage(WebPage):

    tofollow = {
        'links': '//div[@id="questions"]//a[@class="question-hyperlink"]/@href',
        'vote': '//span[@class="vote-count-post"]/strong/text()',
        'answers_count': '//div[@class="question-summary"]//div[starts-with(@class, "status")]/strong/text()',
    }
    WebPageCls = QuestionPage


class StackOverFlow(WebSite):

    url = URL
    Pagination = Pagination(
        URL,
        data={'page': '{page}'},
        end=_get_end()
    )
    WebPageCls = ListPage


def isnew(page):
    "Check that the url wasn't already crawled."
    # I am assuming that already crawled question don't change, and because
    # this spider crawl question in newest to oldest, so when ever crawly see
    # an URL that was already crawled, this mean that all URLs that will follow
    # was crawled too, so better to stop here.
    return page.url not in CRAWLED


def save(page):
    "Save extracted page in a list."
    # This function is run in a greenlet (b/c it's used as crawly pipeline) so
    # that explain why we are using a Lock here.
    LOCK.acquire()
    try:
        NEW_DATA[page.url] = page.data
    finally:
        LOCK.release()


def tojson():
    "Write extracted data in the JSON format to a file."
    old = {}
    try:
        old = json.load(open(FILENAME))
    except IOError:
        pass
    old.update(NEW_DATA)
    json.dump(old, open(FILENAME, 'w'), indent=4)
    runner.log('Dump all questions')


if __name__ == '__main__':
    runner.set_website(StackOverFlow).takewhile(isnew) \
          .add_pipeline(save).on_finish(tojson).start()

Here is an example configuration file (config.json) that instructs logging to write to both the console and a file.

{
    "logging": {
        "version": 1,
        "formatters": {
            "standard": {
                "format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
            }
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "standard",
                "stream": "ext://sys.stdout"
            },
            "file": {
                "class": "logging.handlers.RotatingFileHandler",
                "formatter": "standard",
                "filename": "/tmp/crawly.log",
                "maxBytes": 1000000,
                "backupCount": 3
            }
        },
        "root": {
            "handlers": ["console", "file"],
            "level": "INFO"
        }
    }
}
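
The runner reads this file through the --config option. The exact way crawly consumes it is up to the library, but the "logging" section follows the standard library dictConfig schema, so a minimal sketch of how such a configuration could be applied by hand (assuming you load config.json yourself) looks like this:

import json
import logging
import logging.config

# Load the configuration file passed on the command line.
with open('config.json') as fp:
    config = json.load(fp)

# The "logging" key is a standard dictConfig dictionary (version 1):
# it sets up the console and rotating-file handlers shown above.
logging.config.dictConfig(config['logging'])

logging.getLogger(__name__).info('Logging configured from config.json')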
