# douban.py
import scrapy

from douban250.items import Douban250Item


class DoubanSpider(scrapy.Spider):
    """Crawl the douban.com Top-250 movie chart, one item per movie."""

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one chart page: yield its 25 movie items, then follow the
        "next page" link back into this callback.

        Fix vs. original: each field used ``.getall()``, which returns a
        one-element *list* per scalar value; the MySQL pipeline binds these
        values directly into an INSERT, so they must be plain strings.
        ``.get()`` returns the first match (or None).
        """
        for entry in response.xpath("//ol[@class='grid_view']/li"):
            item = Douban250Item(
                # Rank number shown in the poster block.
                ranking=entry.xpath(".//div[@class='pic']/em/text()").get(),
                # First <span> under the title link holds the primary title.
                name=entry.xpath(".//div[@class='hd']/a/span[1]/text()").get(),
                # Average rating, e.g. "9.7".
                score=entry.xpath(".//span[@class='rating_num']/text()").get(),
                # Vote count text (4th <span> in the star block).
                number=entry.xpath(".//div[@class='star']/span[4]/text()").get(),
                # Short quote; some movies have none, so default to "".
                commentary=entry.xpath(".//span[@class='inq']/text()").get(default=''),
                # Detail-page URL.
                link=entry.xpath(".//div[@class='hd']/a/@href").get(),
            )
            yield item

        # Relative href of the "next page" link; absent on the last page.
        next_page_url = response.xpath("//span[@class='next']/a/@href").get()
        if next_page_url:
            # urljoin resolves the relative href against the current page URL.
            yield scrapy.Request(url=response.urljoin(next_page_url),
                                 callback=self.parse)
pipelines.py:
import pymysql


class Douban250Pipeline(object):
    """Persist scraped Top-250 movie items into the MySQL table ``moive_top``."""

    def __init__(self):
        # Local-dev connection parameters.
        # NOTE(review): 'utf8' in MySQL is the 3-byte subset; 'utf8mb4' is
        # usually wanted for arbitrary text — confirm against the schema.
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': 'root',
            'port': 3306,
            'database': 'moive_top250',
            'charset': 'utf8',
        }
        # Open the connection and a cursor for the spider's lifetime.
        self.connect = pymysql.connect(**dbparams)
        self.cursor = self.connect.cursor()
        # Lazily built INSERT statement; see the ``sql`` property.
        self._sql = None

    def process_item(self, item, spider):
        """Insert one item; roll back (and re-raise) on any DB error.

        Fix vs. original: a failed execute previously left the connection
        stuck in a broken transaction with no rollback.
        """
        try:
            self.cursor.execute(
                self.sql,
                (item['ranking'], item['name'], item['score'],
                 item['number'], item['commentary'], item['link']))
            self.connect.commit()
        except Exception:
            # Undo the failed statement so the connection stays usable,
            # then let Scrapy surface the error.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes.

        Fix vs. original: both were leaked — never closed anywhere.
        """
        self.cursor.close()
        self.connect.close()

    # Parameterized INSERT (id is AUTO_INCREMENT, hence NULL).
    @property
    def sql(self):
        """Build the INSERT statement once and cache it on first access."""
        if not self._sql:
            self._sql = """
            insert into moive_top(id,ranking,name,score,number,commentary,link)
            values (NULL,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
start.py:
# start.py — convenience launcher so the crawl can be started from an IDE.
from scrapy import cmdline

if __name__ == '__main__':
    # Fix vs. original: guard the side effect behind __main__ so importing
    # this module does not launch a crawl; pass argv as an explicit list
    # (equivalent to running ``scrapy crawl douban`` from the project root).
    cmdline.execute(['scrapy', 'crawl', 'douban'])
数据库效果图: