Crawling selected sections of Sina Sports with the Scrapy framework

import scrapy
from selenium import webdriver
from sohuPro.items import SohuproItem


class SohuSpider(scrapy.Spider):
    name = 'sohu'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sports.sina.com.cn/']
    # Goal: crawl the news content of 5 sections of Sina Sports (Champions League, La Liga, Serie A, Bundesliga, etc.)
    models_urls = []  # stores the URLs of the 5 sections

    def __init__(self):
        # one browser instance for the whole crawl; the downloader middleware reuses it
        self.bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe')

    def parse(self, response):
        # 1. Request the home page and parse out the URLs of the 5 sections
        li_list = response.xpath('//*[@id="j_top"]/div[2]/div/ul[1]/li')
        print(len(li_list))
        # 2. Extract the URL of each of the 5 sections from li_list
        for index in [0, 1, 2, 3, 4]:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            print(model_url)
            self.models_urls.append(model_url)
        # Send a request for each section URL
        for url in self.models_urls:
            yield scrapy.Request(url=url, callback=self.parse_detail)

    # The section content is not dynamically generated and could be requested directly;
    # we fetch it with the selenium module only to get familiar with what a downloader middleware does.
    def parse_detail(self, response):
        li_list = response.xpath('//*[@id="contest_list"]/li')
        for li in li_list:
            game_time = li.xpath('./p/text()').extract_first()
            game_team = li.xpath('./a//text()').extract_first()

            # Instantiate an item object for persistent storage
            item = SohuproItem()
            item['game_time'] = game_time
            item['game_team'] = game_team
            yield item  # hand the item over to the pipeline

    def closed(self, reason):
        # Scrapy calls closed() automatically when the spider finishes; shut the browser down here
        self.bro.quit()
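The __init__ above opens a visible Chrome window for the whole crawl. An optional tweak that is not part of the original code is to start the driver in headless mode so no window pops up; a minimal sketch, assuming the same chromedriver path and the Selenium version used above (which still accepts executable_path):

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')     # render pages without opening a browser window
        options.add_argument('--disable-gpu')  # often paired with headless mode on Windows
        self.bro = webdriver.Chrome(
            executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe',
            options=options)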

items.py
import scrapy


class SohuproItem(scrapy.Item):
    # fields filled in by the spider and persisted by the pipeline
    game_time = scrapy.Field()
    game_team = scrapy.Field()
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import time
from scrapy.http import HtmlResponse


class SohuproDownloaderMiddleware:
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response, pick out the ones belonging to the 5 section URLs,
    # and replace them with Selenium-rendered responses.
    def process_response(self, request, response, spider):  # spider: the running spider instance
        bro = spider.bro  # the browser instance created in the spider's __init__
        if request.url in spider.models_urls:
            bro.get(request.url)
            time.sleep(3)  # give the dynamically loaded content time to appear
            page_text = bro.page_source  # now includes the dynamically loaded content

            # Build a new response object containing the rendered section page
            # and return it in place of the original response.
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf8', request=request)
            return new_response
        else:
            # all other responses (e.g. the home page) pass through unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
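The fixed time.sleep(3) works, but it wastes time on fast pages and may be too short on slow ones. A possible refinement, not in the original post, is an explicit wait keyed on the contest_list element that parse_detail later reads; a sketch with a hypothetical helper render_section, assuming that id is present on every section page:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def render_section(bro, url, timeout=10):
        # Load the section page and block until the match list has been injected into the DOM.
        bro.get(url)
        WebDriverWait(bro, timeout).until(
            EC.presence_of_element_located((By.ID, 'contest_list')))
        return bro.page_source

process_response would then call render_section(bro, request.url) instead of bro.get(...) plus the sleep.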
pipelines.py
from itemadapter import ItemAdapter
import pymysql


class SohuproPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            db='scrapy',
            charset='utf8'
        )
        # create the cursor once here, so close_spider can always close it
        self.cursor = self.conn.cursor()
        print('Spider started!!!')

    def process_item(self, item, spider):
        value = (item['game_time'], item['game_team'])
        try:
            sql = 'insert into job_7(game_time,game_team) values(%s,%s)'
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('Insert succeeded')
        except Exception as e:
            self.conn.rollback()
            print('Insert failed!!', e)
        return item

    def close_spider(self, spider):
        print('Spider finished!!')
        self.cursor.close()
        self.conn.close()
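The pipeline assumes a job_7 table already exists in the scrapy database, but the post never shows its schema. A minimal one-off script to create a compatible table (the column names come from the insert statement above; the types and lengths are assumptions):

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='root',
                           port=3306, db='scrapy', charset='utf8')
    with conn.cursor() as cursor:
        # game_time and game_team are stored as plain strings, matching the item fields
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS job_7 ('
            '  id INT AUTO_INCREMENT PRIMARY KEY,'
            '  game_time VARCHAR(255),'
            '  game_team VARCHAR(255))')
    conn.commit()
    conn.close()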
settings.py
# Scrapy settings for sohuPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sohuPro'

SPIDER_MODULES = ['sohuPro.spiders']
NEWSPIDER_MODULE = 'sohuPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'sohuPro.middlewares.SohuproSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'sohuPro.middlewares.SohuproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'sohuPro.pipelines.SohuproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
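With the downloader middleware and the pipeline enabled as above and ROBOTSTXT_OBEY turned off, the crawl is started from the project root with scrapy crawl sohu; LOG_LEVEL = 'ERROR' keeps the console output down to the print statements and any errors.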