# douban.py
import scrapy

from douban250.items import Douban250Item


class DoubanSpider(scrapy.Spider):
    """Crawl the douban.com Top-250 movie chart, one item per movie."""

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one chart page: yield its 25 movie items, then follow the
        "next page" link back into this callback.

        Fix vs. original: each field used ``.getall()``, which returns a
        one-element *list* per scalar value; the MySQL pipeline binds these
        values directly into an INSERT, so they must be plain strings.
        ``.get()`` returns the first match (or None).
        """
        for entry in response.xpath("//ol[@class='grid_view']/li"):
            item = Douban250Item(
                # Rank number shown in the poster block.
                ranking=entry.xpath(".//div[@class='pic']/em/text()").get(),
                # First <span> under the title link holds the primary title.
                name=entry.xpath(".//div[@class='hd']/a/span[1]/text()").get(),
                # Average rating, e.g. "9.7".
                score=entry.xpath(".//span[@class='rating_num']/text()").get(),
                # Vote count text (4th <span> in the star block).
                number=entry.xpath(".//div[@class='star']/span[4]/text()").get(),
                # Short quote; some movies have none, so default to "".
                commentary=entry.xpath(".//span[@class='inq']/text()").get(default=''),
                # Detail-page URL.
                link=entry.xpath(".//div[@class='hd']/a/@href").get(),
            )
            yield item

        # Relative href of the "next page" link; absent on the last page.
        next_page_url = response.xpath("//span[@class='next']/a/@href").get()
        if next_page_url:
            # urljoin resolves the relative href against the current page URL.
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse)
pipelines.py:
import pymysql


class Douban250Pipeline(object):
    """Persist scraped Top-250 movie items into the MySQL table ``moive_top``."""

    def __init__(self):
        # Local-dev connection parameters.
        # NOTE(review): 'utf8' in MySQL is the 3-byte subset; 'utf8mb4' is
        # usually wanted for arbitrary text — confirm against the schema.
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': 'root',
            'port': 3306,
            'database': 'moive_top250',
            'charset': 'utf8',
        }
        # Open the connection and a cursor for the spider's lifetime.
        self.connect = pymysql.connect(**dbparams)
        self.cursor = self.connect.cursor()
        # Lazily built INSERT statement; see the ``sql`` property.
        self._sql = None

    def process_item(self, item, spider):
        """Insert one item; roll back (and re-raise) on any DB error.

        Fix vs. original: a failed execute previously left the connection
        stuck in a broken transaction with no rollback.
        """
        try:
            self.cursor.execute(
                self.sql,
                (item['ranking'], item['name'], item['score'],
                 item['number'], item['commentary'], item['link']))
            self.connect.commit()
        except Exception:
            # Undo the failed statement so the connection stays usable,
            # then let Scrapy surface the error.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes.

        Fix vs. original: both were leaked — never closed anywhere.
        """
        self.cursor.close()
        self.connect.close()

    # Parameterized INSERT (id is AUTO_INCREMENT, hence NULL).
    @property
    def sql(self):
        """Build the INSERT statement once and cache it on first access."""
        if not self._sql:
            self._sql = """
            insert into moive_top(id,ranking,name,score,number,commentary,link)
            values (NULL,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
start.py:
# start.py — convenience launcher so the crawl can be started from an IDE.
from scrapy import cmdline

if __name__ == '__main__':
    # Fix vs. original: guard the side effect behind __main__ so importing
    # this module does not launch a crawl; pass argv as an explicit list
    # (equivalent to running ``scrapy crawl douban`` from the project root).
    cmdline.execute(['scrapy', 'crawl', 'douban'])
数据库效果图: