sohu.py

import scrapy
from selenium import webdriver
from sohuPro.items import SohuproItem


class SohuSpider(scrapy.Spider):
    name = 'sohu'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sports.sina.com.cn/']
    # Goal: scrape the news content of the five major sections on Sina Sports
    # (Champions League, La Liga, Serie A, Bundesliga, etc.)
    models_urls = []  # stores the URLs of the five sections

    def __init__(self):
        # Raw string avoids backslash-escape problems in the Windows path
        self.bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe')

    def parse(self, response):
        # 1. Request the home page and parse out the five section links
        li_list = response.xpath('//*[@id="j_top"]/div[2]/div/ul[1]/li')
        print(len(li_list))
        # 2. Extract each section URL from li_list
        alist = [0, 1, 2, 3, 4]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            print(model_url)
            self.models_urls.append(model_url)
        # Send a request for each section URL
        for url in self.models_urls:
            yield scrapy.Request(url=url, callback=self.parse_detail)

    # The section pages are not dynamically generated and could be requested
    # directly; we fetch them through Selenium anyway to practice using
    # downloader middleware.
    def parse_detail(self, response):
        li_list = response.xpath('//*[@id="contest_list"]/li')
        for li in li_list:
            game_time = li.xpath('./p/text()').extract_first()
            game_team = li.xpath('./a//text()').extract_first()
            # Instantiate an item object for persistent storage
            item = SohuproItem()
            item['game_time'] = game_time
            item['game_team'] = game_team
            yield item  # hand the item over to the pipeline

    def closed(self, reason):
        # closed() is the hook Scrapy calls when a spider shuts down
        # (close_spider is a pipeline hook, not a spider hook)
        self.bro.quit()
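With the spider in place, the crawl is started with scrapy crawl sohu from the project directory. As a convenience, a small runner script can sit next to scrapy.cfg so the spider can be launched from an IDE; the filename start.py is just an assumption, any name works.

start.py (hypothetical helper, not part of the original project)
from scrapy import cmdline

# Equivalent to running "scrapy crawl sohu" in a terminal opened at the project root
cmdline.execute('scrapy crawl sohu'.split())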
items.py

import scrapy


class SohuproItem(scrapy.Item):
    game_time = scrapy.Field()
    game_team = scrapy.Field()
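A SohuproItem behaves like a dict keyed by the declared Fields, which is why the spider assigns item['game_time'] and the pipeline reads the same keys. A quick throwaway check (the sample values below are made up):

from sohuPro.items import SohuproItem

item = SohuproItem()
item['game_time'] = '12-01 03:00'        # made-up sample value
item['game_team'] = 'Team A VS Team B'   # made-up sample value
print(dict(item))  # {'game_time': '12-01 03:00', 'game_team': 'Team A VS Team B'}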
middlewares.py

import time

from scrapy import signals
from scrapy.http import HtmlResponse
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class SohuproDownloaderMiddleware:

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response; for the five section requests, replace the
    # original response with one rendered by Selenium
    def process_response(self, request, response, spider):  # spider: the spider instance
        bro = spider.bro  # the browser object instantiated in the spider
        if request.url in spider.models_urls:
            bro.get(request.url)
            time.sleep(3)
            page_text = bro.page_source  # includes the dynamically loaded content
            # Build a new response object holding the rendered section page
            # and return it in place of the original response
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
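Because the middleware drives the browser for every section request, Chrome opens a visible window during the crawl. If that gets in the way, the driver created in the spider's __init__ can be made headless; this is only a sketch, assuming the same chromedriver path and a Selenium 3-style API:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')      # render pages without opening a window
options.add_argument('--disable-gpu')
bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬虫\chromedriver.exe',
                       options=options)
# In the spider this becomes: self.bro = webdriver.Chrome(...) with the same options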
pipelines.py

from itemadapter import ItemAdapter
import pymysql


class SohuproPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            db='scrapy',
            charset='utf8'
        )
        print('Spider started!!!')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        value = (item['game_time'], item['game_team'])
        try:
            sql = 'insert into job_7(game_time, game_team) values(%s, %s)'
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('Insert succeeded')
        except Exception:
            self.conn.rollback()
            print('Insert failed!!')
        return item

    def close_spider(self, spider):
        print('Spider finished!!')
        self.cursor.close()
        self.conn.close()
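The pipeline assumes the MySQL database scrapy and the table job_7 already exist. A one-off setup script could create them with pymysql; the column sizes below are guesses, widen them if the scraped text turns out longer:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root',
                       port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute('create database if not exists scrapy character set utf8')
cursor.execute('use scrapy')
cursor.execute('create table if not exists job_7('
               'id int primary key auto_increment,'
               'game_time varchar(100),'   # assumed length
               'game_team varchar(200))')  # assumed length
conn.commit()
cursor.close()
conn.close()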
settings.py

# Scrapy settings for sohuPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sohuPro'

SPIDER_MODULES = ['sohuPro.spiders']
NEWSPIDER_MODULE = 'sohuPro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'sohuPro.middlewares.SohuproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'sohuPro.middlewares.SohuproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'sohuPro.pipelines.SohuproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'