本文共 3863 字,大约阅读时间需要 12 分钟。
以上自行解决。
# Create the Scrapy project skeleton named commit_spider.
scrapy startproject commit_spider
# Generate a spider named myspider for the nvd.nist.gov domain.
scrapy genspider myspider nvd.nist.gov
# Scrapy settings for the commit_spider project.

# Honour the robots.txt rules of the crawled site.
ROBOTSTXT_OBEY = True

# Pool of HTTP proxies. Each entry maps the key "ip_port" to a full proxy URL;
# the RandomProxy downloader middleware picks one entry per request.
PROXY_LIST = [
    {"ip_port": "http://211.137.52.158:8080"},
    {"ip_port": "http://111.47.154.34:53281"},
    {"ip_port": "http://183.220.145.3:80"},
    {"ip_port": "http://223.100.166.3:36945"},
    {"ip_port": "http://120.194.42.157:38185"},
    {"ip_port": "http://223.82.106.253:3128"},
    {"ip_port": "http://117.141.155.244:53281"},
    {"ip_port": "http://120.198.76.45:41443"},
    {"ip_port": "http://123.136.8.122:3128"},
    {"ip_port": "http://117.141.155.243:53281"},
    {"ip_port": "http://183.196.168.194:9000"},
    {"ip_port": "http://117.141.155.242:53281"},
    {"ip_port": "http://183.195.106.118:8118"},
    {"ip_port": "http://112.14.47.6:52024"},
    {"ip_port": "http://218.204.153.156:8080"},
    {"ip_port": "http://223.71.203.241:55443"},
    {"ip_port": "http://117.141.155.241:53281"},
    {"ip_port": "http://221.180.170.104:8080"},
    {"ip_port": "http://183.247.152.98:53281"},
    {"ip_port": "http://183.196.170.247:9000"},
]

# Pool of User-Agent header values; the RandomUserAgent downloader middleware
# picks one per request. Some strings are listed twice; the duplicates are
# kept exactly as they were in the original list.
UA_LIST = [
    'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
    'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)',
    'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
]

# Default headers attached to outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en',
}

# scrapy-redis integration: use the Redis-backed duplicate filter and
# scheduler, keep the scheduler state between runs, and point at a local Redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
import random

from commit_spider.settings import PROXY_LIST,UA_LIST
class RandomProxy:
    """Downloader middleware that assigns a randomly chosen proxy to each request."""

    def process_request(self, request, spider):
        """Pick a random entry from PROXY_LIST and set it as the request's proxy.

        The chosen entry's "ip_port" URL is written to ``request.meta['proxy']``.
        Returns None implicitly.
        """
        proxy = random.choice(PROXY_LIST)
        request.meta['proxy'] = proxy['ip_port']


class RandomUserAgent:
    """Downloader middleware that assigns a randomly chosen User-Agent to each request."""

    def process_request(self, request, spider):
        """Pick a random string from UA_LIST and set it as the User-Agent header.

        Any User-Agent already present on the request is overwritten.
        Returns None implicitly.
        """
        ua = random.choice(UA_LIST)
        request.headers['User-Agent'] = ua
# Enable the two custom downloader middlewares; both are registered with the
# same order value (543).
DOWNLOADER_MIDDLEWARES = {
    # Sets request.meta['proxy'] to a random proxy URL.
    'commit_spider.middlewares.RandomProxy': 543,
    # Sets the User-Agent request header to a random UA string.
    'commit_spider.middlewares.RandomUserAgent': 543,
}
'scrapy_redis.pipelines.RedisPipeline': 400,
爬虫代码就是使用 response.xpath 或者 bs4 解析页面了，具体如何做都是套路，网上已经有很多教程了。
转载地址:http://otprn.baihongyu.com/