Crawling selected sections of Sina Sports with the Scrapy framework

import scrapy
from selenium import webdriver
from sohuPro.items import SohuproItem


class SohuSpider(scrapy.Spider):
    name = 'sohu'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sports.sina.com.cn/']
    # Goal: crawl the news content of 5 sections of Sina Sports (Champions League, La Liga, Serie A, Bundesliga, etc.)
    models_urls = []  # stores the URLs of the 5 sections

    def __init__(self):
        # one browser instance for the whole crawl; the downloader middleware reuses it
        self.bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe')

    def parse(self, response):
        # 1. Request the home page and parse out the URLs of the 5 sections
        li_list = response.xpath('//*[@id="j_top"]/div[2]/div/ul[1]/li')
        print(len(li_list))
        # 2. Extract the URL of each of the 5 sections from li_list
        for index in [0, 1, 2, 3, 4]:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            print(model_url)
            self.models_urls.append(model_url)
        # Send a request for each section URL
        for url in self.models_urls:
            yield scrapy.Request(url=url, callback=self.parse_detail)

    # The section content is not dynamically generated and could be requested directly;
    # we fetch it with the selenium module only to get familiar with what a downloader middleware does.
    def parse_detail(self, response):
        li_list = response.xpath('//*[@id="contest_list"]/li')
        for li in li_list:
            game_time = li.xpath('./p/text()').extract_first()
            game_team = li.xpath('./a//text()').extract_first()

            # Instantiate an item object for persistent storage
            item = SohuproItem()
            item['game_time'] = game_time
            item['game_team'] = game_team
            yield item  # hand the item over to the pipeline

    def closed(self, reason):
        # Scrapy calls closed() automatically when the spider finishes; shut the browser down here
        self.bro.quit()
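The __init__ above opens a visible Chrome window for the whole crawl. An optional tweak that is not part of the original code is to start the driver in headless mode so no window pops up; a minimal sketch, assuming the same chromedriver path and the Selenium version used above (which still accepts executable_path):

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')     # render pages without opening a browser window
        options.add_argument('--disable-gpu')  # often paired with headless mode on Windows
        self.bro = webdriver.Chrome(
            executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe',
            options=options)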

items.py
import scrapy


class SohuproItem(scrapy.Item):
    # fields filled in by the spider and persisted by the pipeline
    game_time = scrapy.Field()
    game_team = scrapy.Field()
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import time
from scrapy.http import HtmlResponse


class SohuproDownloaderMiddleware:
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response, pick out the ones belonging to the 5 section URLs,
    # and replace them with Selenium-rendered responses.
    def process_response(self, request, response, spider):  # spider: the running spider instance
        bro = spider.bro  # the browser instance created in the spider's __init__
        if request.url in spider.models_urls:
            bro.get(request.url)
            time.sleep(3)  # give the dynamically loaded content time to appear
            page_text = bro.page_source  # now includes the dynamically loaded content

            # Build a new response object containing the rendered section page
            # and return it in place of the original response.
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf8', request=request)
            return new_response
        else:
            # all other responses (e.g. the home page) pass through unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
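The fixed time.sleep(3) works, but it wastes time on fast pages and may be too short on slow ones. A possible refinement, not in the original post, is an explicit wait keyed on the contest_list element that parse_detail later reads; a sketch with a hypothetical helper render_section, assuming that id is present on every section page:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def render_section(bro, url, timeout=10):
        # Load the section page and block until the match list has been injected into the DOM.
        bro.get(url)
        WebDriverWait(bro, timeout).until(
            EC.presence_of_element_located((By.ID, 'contest_list')))
        return bro.page_source

process_response would then call render_section(bro, request.url) instead of bro.get(...) plus the sleep.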
pipelines.py
from itemadapter import ItemAdapter
import pymysql


class SohuproPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            db='scrapy',
            charset='utf8'
        )
        # create the cursor once here, so close_spider can always close it
        self.cursor = self.conn.cursor()
        print('Spider started!!!')

    def process_item(self, item, spider):
        value = (item['game_time'], item['game_team'])
        try:
            sql = 'insert into job_7(game_time,game_team) values(%s,%s)'
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('Insert succeeded')
        except Exception as e:
            self.conn.rollback()
            print('Insert failed!!', e)
        return item

    def close_spider(self, spider):
        print('Spider finished!!')
        self.cursor.close()
        self.conn.close()
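The pipeline assumes a job_7 table already exists in the scrapy database, but the post never shows its schema. A minimal one-off script to create a compatible table (the column names come from the insert statement above; the types and lengths are assumptions):

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='root',
                           port=3306, db='scrapy', charset='utf8')
    with conn.cursor() as cursor:
        # game_time and game_team are stored as plain strings, matching the item fields
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS job_7 ('
            '  id INT AUTO_INCREMENT PRIMARY KEY,'
            '  game_time VARCHAR(255),'
            '  game_team VARCHAR(255))')
    conn.commit()
    conn.close()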
settings.py
# Scrapy settings for sohuPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sohuPro'

SPIDER_MODULES = ['sohuPro.spiders']
NEWSPIDER_MODULE = 'sohuPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'sohuPro.middlewares.SohuproSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'sohuPro.middlewares.SohuproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'sohuPro.pipelines.SohuproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
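With the downloader middleware and the pipeline enabled as above and ROBOTSTXT_OBEY turned off, the crawl is started from the project root with scrapy crawl sohu; LOG_LEVEL = 'ERROR' keeps the console output down to the print statements and any errors.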