Douban Top 250


douban.py:

import scrapy
from douban250.items import Douban250Item

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Each <li> under the grid_view list is one movie entry
        movies = response.xpath("//ol[@class='grid_view']/li")
        for movie in movies:
            # Rank
            ranking = movie.xpath(".//div[@class='pic']/em/text()").get()
            # Title
            name = movie.xpath(".//div[@class='hd']/a/span[1]/text()").get()
            # Rating
            score = movie.xpath(".//span[@class='rating_num']/text()").get()
            # Number of people who rated
            number = movie.xpath(".//div[@class='star']/span[4]/text()").get()
            # One-line quote (missing for some movies, so this may be None)
            commentary = movie.xpath(".//span[@class='inq']/text()").get()
            # Detail-page link
            link = movie.xpath(".//div[@class='hd']/a/@href").get()
            # Build the item
            item = Douban250Item(
                ranking=ranking, name=name, score=score,
                number=number, commentary=commentary, link=link
            )
            # Hand the item to the pipeline
            yield item
        # Relative href of the next page, if there is one
        next_page_url = response.xpath("//span[@class='next']/a/@href").get()
        if next_page_url:
            # Resolve the relative href to an absolute URL
            next_url = response.urljoin(next_page_url)
            # Re-enter parse to handle the next page
            yield scrapy.Request(url=next_url, callback=self.parse)

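The spider imports Douban250Item from douban250.items, which isn't shown above. A minimal sketch of that items.py, assuming one field per value extracted in parse:

import scrapy

class Douban250Item(scrapy.Item):
    # One field per value extracted in DoubanSpider.parse
    ranking = scrapy.Field()
    name = scrapy.Field()
    score = scrapy.Field()
    number = scrapy.Field()
    commentary = scrapy.Field()
    link = scrapy.Field()
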
pipelines.py:

import pymysql

class Douban250Pipeline(object):
    def __init__(self):
        # Connection parameters
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': 'root',
            'port': 3306,
            'database': 'moive_top250',
            'charset': 'utf8'
        }
        # Open the connection with the parameters above
        self.connect = pymysql.connect(**dbparams)
        # Cursor used to execute statements
        self.cursor = self.connect.cursor()
        # Lazily built SQL statement (see the sql property below)
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['ranking'], item['name'], item['score'],
                                       item['number'], item['commentary'], item['link']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # Release the cursor and connection when the crawl finishes
        self.cursor.close()
        self.connect.close()

    # Expose the INSERT statement as a property, built once on first use
    # Columns: ranking, name, score, number, commentary, link
    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into moive_top(id, ranking, name, score, number, commentary, link)
                values (NULL, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

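Scrapy only routes yielded items into Douban250Pipeline if it is registered in settings.py, and Douban tends to reject requests carrying Scrapy's default User-Agent, so a browser-like one is usually needed. A sketch of the relevant settings; the exact User-Agent string is just an example:

# settings.py (relevant entries only)

# Register the pipeline so yielded items reach Douban250Pipeline
ITEM_PIPELINES = {
    'douban250.pipelines.Douban250Pipeline': 300,
}

# Douban commonly blocks the default Scrapy User-Agent; any recent browser string works
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
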
start.py:

from scrapy import cmdline
# Run the spider as if "scrapy crawl douban" were typed on the command line
cmdline.execute("scrapy crawl douban".split())

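The INSERT in the pipeline assumes a moive_top table already exists in the moive_top250 database. A minimal sketch of a one-off script that creates it with pymysql; the column types and lengths are assumptions chosen to fit the scraped strings:

import pymysql

# One-off script to create the table the pipeline inserts into.
# Column types are assumptions; adjust lengths/types as needed.
connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                          port=3306, database='moive_top250', charset='utf8')
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS moive_top (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ranking VARCHAR(10),
        name VARCHAR(100),
        score VARCHAR(10),
        number VARCHAR(50),
        commentary VARCHAR(255),
        link VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()
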
Database results (screenshot not included):