[TOC]
(1) urllib simply crawls the specified website
(2) Scrapy crawler framework
(3) BeautifulSoup crawler analysis
1. Initial crawler
Case 1: urllib, the url + lib package that ships with Python
#! /usr/bin/python
# Function: The first crawler lesson
import urllib.request  # Import the required module from the urllib package
import urllib.parse    # Used for URL parsing/quoting

# Case 1:
response = urllib.request.urlopen("http://www.weiyigeek.github.io" + urllib.parse.quote("cyber security"))  # quote() percent-encodes the non-ASCII part of the URL
html = response.read()        # Returns a bytes string
html = html.decode('utf-8')   # Decode it
print("Writing to file.....")
f = open('weiyigeek.txt', 'w+', encoding='utf-8')  # Open the file
f.writelines(html)
f.close()                     # Close the file
print("Result of the website request:\n", html)
# Case 2:
url = "http://placekitten.com/g/600/600"
response = urllib.request.urlopen(url)  # Accepts a URL string or a Request() object and returns a response object
img = response.read()
filename = url[-3:] + '.jpg'
with open(filename, 'wb+') as f:  # Note: the image is written in binary mode
    f.write(img)
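For comparison, a minimal sketch of the same download written with context managers (the same with-statement style used in the proxy case later), so that the response and the file are both closed automatically:
import urllib.request

url = "http://placekitten.com/g/600/600"
with urllib.request.urlopen(url) as response:  # the response is closed on exit
    img = response.read()                      # binary image data
with open(url[-3:] + '.jpg', 'wb') as f:       # e.g. "600.jpg"
    f.write(img)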
2. Python crawler implementation and optimization
Case 1: A spider that calls the Youdao translation API for Chinese/English translation
#! /usr/bin/python
# Function: Crawler lesson 2 - JSON parsing / request headers
import urllib.request
import urllib.parse
import json
import time

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
while True:
    i = input("Please enter the English to translate (type Q to quit):")
    if i == 'Q' or i == 'q':
        break
    data = {}
    data['i'] = i
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['doctype'] = 'json'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['salt'] = '15550362545153'
    data['sign'] = 'a28b8eb61693e30842ebbb4e0b36d406'
    data['action'] = 'FY_BY_CLICKBUTTION'
    data['typoResult'] = 'false'
    data = urllib.parse.urlencode(data).encode('utf-8')

    # Modify the request headers
    # Build the Request object and add header information
    req = urllib.request.Request(url, data)  # A header dictionary can also be passed in directly
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    req.add_header('Cookie', ' YOUDAO_MOBILE_ACCESS_TYPE=1; [email protected]; OUTFOX_SEARCH_USER_ID_NCOO=1911553850.7151666; YOUDAO_FANYI_SELECTOR=ON; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc8N5HySla85aD-6kpOw; ___rl__test__cookies=1555036254514; UM_distinctid=16a0f2c1b0b146-0612adf0fe3fd6-4c312c7c-1fa400-16a0f2c1b0c659; SESSION_FROM_COOKIE=fanyiweb')
    req.add_header('Referer', 'http://fanyi.youdao.com/')

    # The response object returned by the request
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    jtarget = json.loads(html)  # Parse the JSON response
    print("Translated result:", jtarget['translateResult'][0][0]['tgt'])
    time.sleep(1)  # Sleep 1 s to avoid sending requests too frequently
    print("Request header information:", req.headers)
    print("Request URL:", res.geturl())
    print("Status code:", res.getcode())
    print("Response headers:\n", res.info())
# Please enter the English to translate (type Q to quit): whoami
# Translated result: Show this user information
# Request header information:{'User-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0','Cookie':' YOUDAO_MOBILE_ACCESS_TYPE=1; [email protected]; OUTFOX_SEARCH_USER_ID_NCOO=1911553850.7151666; YOUDAO_FANYI_SELECTOR=ON; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc8N5HySla85aD-6kpOw; ___rl__test__cookies=1555036254514; UM_distinctid=16a0f2c1b0b146-0612adf0fe3fd6-4c312c7c-1fa400-16a0f2c1b0c659; SESSION_FROM_COOKIE=fanyiweb','Referer':'http://fanyi.youdao.com/'}
# Request URL: http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule
# Status code: 200
# Response headers:
# Server: Tengine
# Date: Fri, 12 Apr 2019 03:23:02 GMT
# Content-Type: application/json;charset=utf-8
# Transfer-Encoding: chunked
# Connection: close
# Vary: Accept-Encoding
# Vary: Accept-Encoding
# Content-Language: en-US
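As the comment in the code above notes, the Request() constructor also accepts a header dictionary directly, which saves the repeated add_header() calls; a minimal sketch reusing url and data from the loop above:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Referer': 'http://fanyi.youdao.com/',
}
req = urllib.request.Request(url, data, headers)  # equivalent to the add_header() calls above
res = urllib.request.urlopen(req)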
3. Crawler parameter settings
Case 3: Use a proxy to request a website
#! /usr/bin/python3
# Crawler lesson 3: proxies. The general steps for using a proxy IP with urllib are:
#   1. Set the proxy address
#   2. Create a ProxyHandler
#   3. Create an Opener
#   4. Install the Opener
import urllib.request
import random

url1 = 'http://myip.kkcha.com/'
url2 = 'http://freeapi.ipip.net/'
proxylist = ['116.209.52.49:9999', '218.60.8.83:3129']
ualist = ['Mozilla/5.0 (compatible; MSIE 12.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
          'Mozilla/5.0 (Windows NT 6.7; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.7; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
          ]
proxyip = random.choice(proxylist)
# The proxy parameter is a dictionary: {'protocol': 'proxy-ip:port'}
proxy = urllib.request.ProxyHandler({'http': proxyip})
# Create a custom opener
pro_opener = urllib.request.build_opener(proxy)
pro_opener.addheaders = [('User-Agent', random.choice(ualist))]  # Random request header
# Install the opener globally
urllib.request.install_opener(pro_opener)
## Alternatively, call opener.open(url) directly
## Make the requests through the proxy
url2 = url2 + proxyip.split(":")[0]
with urllib.request.urlopen(url1) as u:
    print(u.headers)
    res = u.read().decode('utf-8')
    print(res)
with urllib.request.urlopen(url2) as u:
    res = u.read().decode('utf-8')
    print(res)
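As the comment above notes, the custom opener can also be used directly instead of being installed globally; a minimal sketch reusing pro_opener and url1 from the snippet above:
res = pro_opener.open(url1)  # opener.open() accepts a URL string or a Request object
print(res.read().decode('utf-8'))
res.close()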
3. Exception handling with the urllib library
#! /usr/bin/python3
# Function: urllib exception handling
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError

urlerror = 'http://www.weiyigeek.com'
urlcode = 'http://www.weiyigeek.github.io/demo.html'

def url_open(url):
    req = Request(url)
    req.add_header('APIKEY', 'This is a password!')
    try:
        res = urlopen(req)
    except (HTTPError, URLError) as e:
        if hasattr(e, 'code'):  # The code attribute must be checked before reason
            print('HTTP request error code:', e.code)
            print(e.read().decode('utf-8'))  # [Note] e.read() returns the error page body
        elif hasattr(e, 'reason'):
            print('Failed to reach the server:', e.reason)
    else:
        print("Successful!")

if __name__ == '__main__':
    url_open(urlerror)
    url_open(urlcode)
################## Results ##################
# Failed to reach the server: [Errno 11001] getaddrinfo failed
# HTTP request error code: 404
# <html>
# <head><title>404 Not Found</title></head>
# <body>
# <center><h1>404 Not Found</h1></center>
# <hr><center>nginx/1.15.9</center>
# </body>
# </html>
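Because HTTPError is a subclass of URLError (which is why the code attribute has to be checked before reason above), the two errors can also be caught in separate except clauses; a minimal sketch of that variant:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError

def url_open2(url):
    try:
        res = urlopen(Request(url))
    except HTTPError as e:      # the server answered, but with an error status
        print('HTTP request error code:', e.code)
    except URLError as e:       # the server could not be reached at all
        print('Failed to reach the server:', e.reason)
    else:
        print("Successful!", res.getcode())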
4. Crawler regex matching
Case 4: Regular expressions combined with a crawler
#! /usr/bin/python3
# Function: regular expressions and the crawler
from urllib.request import Request, urlopen, urlretrieve
from urllib.error import HTTPError, URLError
import re
import os

def url_open(url):
    req = Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    try:
        res = urlopen(req)
        html = res.read()
    except HTTPError as e:
        print("Server request error:", e.code)
        return 0
    except URLError as e:
        print("Failed to reach the server:", e.reason)
        return 0
    else:
        return html

def save_img(url, dir):
    i = 0
    os.mkdir(dir)
    os.chdir(os.curdir + '/' + dir)
    for each in url:
        # urlretrieve is slated for deprecation and not recommended, but it is convenient
        urlretrieve(each, str(i) + '.jpg', None)
        i += 1
    else:
        print("Download completed!\a\a")

def get_img(url):
    html = url_open(url)
    if html == 0:
        exit("Request error, exiting")
    res = html.decode('utf-8')
    p = r'<img src="([^"]+\.jpg)"'
    imglist = re.findall(p, res)
    save_img(imglist, 'test')
    print(imglist)

if __name__ == '__main__':
    url = 'http://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E7%AC%AC%E4%BA%8C%E5%B8%88%E8%8C%83%E5%AD%A6%E9%99%A2&ie=utf-8&tab=album'
    get_img(url)
WeiyiGeek. Regular and crawler utilization
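As the comment in save_img() notes, urlretrieve() is discouraged; a minimal sketch of the same download loop using urlopen() plus a binary file write instead (the function name and the User-Agent value here are illustrative):
import os
from urllib.request import Request, urlopen

def save_img_no_urlretrieve(urls, dir):
    os.makedirs(dir, exist_ok=True)
    for i, each in enumerate(urls):
        req = Request(each, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as res, open(os.path.join(dir, str(i) + '.jpg'), 'wb') as f:
            f.write(res.read())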
5. Advanced crawler regex usage
Case 5: Scraping ip:port entries from a proxy-listing website
#! /usr/bin/python3
# The last urllib crawler lesson
import urllib.request
from urllib.error import HTTPError, URLError
import re
import os

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    try:
        res = urllib.request.urlopen(req)
    except (HTTPError, URLError) as e:
        print("An error occurred:", e.code, 'Error page:', e.read())
        return 0
    else:
        return res.read().decode('utf-8')

def main1(url, filename):
    html = url_open(url)
    if html == 0:
        exit("Request error, exiting!")
    exp = r'<td>((?:(?:[01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])\.){0,3}(?:[01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5]))</td>\n(?:\s*?)<td>(?P<port>\d{0,4})</td>'  # This regex is a pitfall
    regres = re.findall(exp, html, re.M)
    iplist = []
    for each in regres:
        ipport = each[0] + ':' + each[1]
        iplist.append(ipport)
    with open(filename, 'w+', encoding='utf-8') as f:
        for i in range(len(iplist)):
            f.write(iplist[i] + '\n')

if __name__ == '__main__':
    url = 'https://www.xicidaili.com/nn/'
    main1(url, 'proxyip.txt')
######### Fetch proxy results################
# 119.102.186.99:9999
# 111.177.175.234:9999
# 222.182.121.10:8118
# 110.52.235.219:9999
# 112.85.131.64:9999
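A follow-up sketch tying this case back to Case 3: read the proxyip.txt produced above and test each scraped proxy by plugging it into a ProxyHandler (url1 here is assumed to be any IP-echo page, such as the one used earlier):
import urllib.request

url1 = 'http://myip.kkcha.com/'
with open('proxyip.txt', encoding='utf-8') as f:
    for proxyip in (line.strip() for line in f if line.strip()):
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxyip}))
        try:
            opener.open(url1, timeout=5).read()
            print(proxyip, 'usable')
        except Exception:
            print(proxyip, 'failed')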
1.1 Anaconda installation process
This is a relatively simple way to install Scrapy (especially on Windows). You can install it this way, or use the platform-specific methods described below.
Anaconda is a Python distribution that contains commonly used data science libraries. If you don't have it installed, you can go to https://www.continuum.io/downloads to download and install the corresponding platform package.
If it is already installed, you can easily install Scrapy via the conda command.
The installation command is as follows:
conda install Scrapy
1.2 Windows installation process
On Windows, the most reliable way to install is via wheel files; here pip3 is used in a Windows 10 environment.
# Current environment: win10+py3.7
pip3 install wheel
pip3 install lxml            # Find the matching lxml version and install it
pip3 install zope.interface  # Install zope.interface
pip3 install Twisted
pip3 install pywin32
pip3 install Scrapy #Finally install Scrapy
# Install pyOpenSSL
# Download the wheel file from the official website, https://pypi.python.org/pypi/pyOpenSSL#downloads
pip3 install pyOpenSSL-16.2.0-py2.py3-none-any.whl
# Py3.7: upgrade all installed libraries in one go
from subprocess import call
from pip._internal.utils.misc import get_installed_distributions

for dist in get_installed_distributions():
    call("pip install --upgrade " + dist.project_name, shell=True)
1.3 CentOS, RedHat, Fedora
To ensure that some necessary libraries have been installed, run the following command:
sudo yum groupinstall development tools
sudo yum install python34-devel epel-release libxslt-devel libxml2-devel openssl-devel
pip3 install Scrapy
1.4 Ubuntu, Debian, Deepin
Dependency installation: first make sure the necessary libraries are installed by running the following command:
sudo apt-get install build-essential python3-dev libssl-dev libffi-dev libxml2 libxml2-dev libxslt1-dev zlib1g-dev
pip3 install Scrapy
1.5 Mac OS
Dependency installation: building Scrapy's dependencies on Mac requires a C compiler and development header files, which are generally provided by Xcode. Install them by running the following commands:
xcode-select --install
pip3 install Scrapy
To verify the installation, run the scrapy command in a terminal. If output similar to the following appears, Scrapy has been installed successfully.
WeiyiGeek.scrapy
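An alternative quick check from a Python shell (a minimal sketch; it only assumes Scrapy was installed into the current environment):
import scrapy
print(scrapy.__version__)  # any version string here means the installation worked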
Scrapy is a Python-based crawler framework: an application framework written to crawl websites and extract structured data. It can be used for data mining, information processing, storing historical data, and similar needs.
Crawling a website with Scrapy takes four steps (a minimal code sketch follows the diagram below):
Create a Scrapy project
Define the Items to extract
Write a spider to crawl the site and extract the Items
Write an Item Pipeline to store the extracted Items
Framework diagram:
WeiyiGeek.Scrapy
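A minimal sketch of how those four steps map onto code (the names DemoItem/DemoSpider are illustrative, not part of the projects built later; step 1 is the scrapy startproject command itself):
import scrapy

class DemoItem(scrapy.Item):                      # Step 2: define the Item to extract
    title = scrapy.Field()

class DemoSpider(scrapy.Spider):                  # Step 3: write the spider
    name = 'demo'
    start_urls = ['http://weiyigeek.github.io/']

    def parse(self, response):
        for text in response.css('title::text').extract():
            yield DemoItem(title=text)            # Step 4: yielded items flow on to the Item Pipeline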
2.1 scrapy common commands
scrapy startproject douban                        # Create and initialize a project named douban
scrapy genspider douban_spider movie.douban.com   # Generate a generic spider file; the second argument is the domain to crawl
scrapy crawl douban_spider                        # Start crawling; douban_spider is the spider (project entry) name
scrapy shell <url>                                # Interactive shell for testing the data-extraction code of a crawler project
scrapy shell "http://scrapy.org" --nolog          # Suppress log output; note that the URL is double-quoted
scrapy crawl douban_spider -o movielist.json      # Store the crawled data in a specific format
scrapy crawl douban_spider -o movielist.csv
2.2 scrapy project analysis
weiyigeek
│ items.py        # Data model file: defines the item container (serial number, name, description, rating)
│ middlewares.py  # Middleware settings (e.g. crawler IP disguise)
│ pipelines.py    # Writes data through the pipeline to a file or disk
│ settings.py     # Project settings (USER-AGENT, crawl delay, ...)
│ __init__.py
├─spiders
│ │ douban_spider.py  # Spider entry point of the project
│ │ __init__.py
scrapy.cfg         # Project configuration information
2.3 Introduction to scrapy selector
In Scrapy, data is extracted with selectors, a mechanism based on XPath and CSS expressions. Selectors have four basic methods (a short usage sketch follows the list):
WeiyiGeek.xpath syntax attributes
WeiyiGeek. Example
xpath(): pass in an XPath expression and return the selector list of all nodes matching the expression
css(): pass in a CSS expression and return the selector list of all nodes matching the expression, e.g. response.css('.class-name tag::text').extract() to pull out strings
extract(): serialize the nodes as unicode strings and return a list
re(): extract data according to the regular expression passed in and return a list of unicode strings
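A short usage sketch of the four methods, assuming a response object like the one available inside scrapy shell (shown in the next section):
titles_x = response.xpath('//title')             # xpath(): returns a SelectorList
titles_c = response.css('title::text')           # css(): the same, via a CSS expression
as_text = titles_c.extract()                     # extract(): list of unicode strings
years = response.css('p::text').re(r'\d{4}')     # re(): strings matched by the regular expression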
2.4 scrapy interactive debugging
Description: the Scrapy shell is an interactive terminal where you can try out and debug your extraction code without having to start the spider.
Case:
> scrapy shell "http://movie.douban.com/chart"
>>> help(command)
>>> request
<GET http://www.weiyigeek.github.io>
>>> response.url
'https://movie.douban.com/chart'
>>> response
<200 https://movie.douban.com/chart>
>>> response.headers     # Response headers
>>> response.body        # Page source (bytes)
>>> response.text
>>> response.xpath('//title')              # Returns an xpath selector
>>> response.xpath('//title').extract()    # Extract the content matched by the xpath expression
['<title>\n Douban Movie Ranking\n</title>']
>>> response.xpath('//title/text()').extract()   # Extract only the text
['\n Douban Movie Ranking\n']
>>> response.xpath("//div[@class='pl2']//a").extract_first().strip()   # extract_first() extracts the first match
'<a href="https://movie.douban.com/subject/25986662/" class="">\n Crazy Alien\n / <span style="font-size:13px;">Crazy Alien</span>\n </a>'

# Extracting with CSS
>>> sel.css('.pl2 a::text').extract_first().strip()
'Crazy Alien\n /'

# Supplying request headers when a site requires them:
>>> from scrapy import Request      # Import the Request class
>>> data = Request("https://www.taobao.com", headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"})
>>> fetch(data)       # Fetch the requested website
2017-11-30 22:24:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.taobao.com> (referer: None)
>>> sel.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div/ul/li[1]/a[1]')
[<Selector xpath='/html/body/div[4]/div[1]/div[1]/div[1]/div/ul/li[1]/a[1]' data='<a href="https://www.taobao.com/markets/'>]
>>> data.headers      # View the headers that were set
{b'User-Agent': [b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}

# Matching multiple strings and iterating over them
>>> fetch('http://weiyigeek.github.io')
>>> title = response.css('.article-header a::text').extract()
>>> for each in title:
...     print(each)
...
Summary of Security Device Policy Bypass Technology.md
Win platform security configuration.md
Python3 regular expression special symbols and usage.md
Python3 crawler learning.md
Disk high-availability solution(DBA).md
Nodejs introductory learning 1.md
Node.js introduction and installation.md
Domain Control Security Fundamentals.md
Win intranet penetration information search.md
Highly available service solutions(DBA).md
WeiyiGeek.scrapyshell
2.5 A simple scrapy example
scrapy startproject weiyigeek
scrapy genspider blog_spider www.weiyigeek.github.io
'''
items.py: edit the data model file for the objects to capture
'''
import scrapy

class WeiyigeekItem(scrapy.Item):
    # items.py defines the data model for the objects to capture
    title = scrapy.Field()  # Title
    href = scrapy.Field()   # Title link
    time = scrapy.Field()   # Creation time
'''
blog_spider.py: the main spider file
'''
# -*- coding: utf-8 -*-
import scrapy
from weiyigeek.items import WeiyigeekItem  # Import the item class from the project's items.py

class BlogSpiderSpider(scrapy.Spider):
    name = 'blog_spider'                           # Spider name
    allowed_domains = ['www.weiyigeek.github.io']  # Domains the spider is allowed to crawl
    start_urls = ['http://www.weiyigeek.github.io/', 'http://weiyigeek.github.io/page/2/']  # Start URLs handed to the scheduler

    # Parse the web page object returned for each request
    def parse(self, response):
        sel = scrapy.selector.Selector(response)  # Scrapy selector
        sites = sel.css('.article-header')        # Select with a CSS selector
        items = []
        for each in sites:
            item = WeiyigeekItem()  # Data container class
            item['title'] = each.xpath('a/text()').extract()
            item['href'] = each.xpath('a/@href').extract()
            item['time'] = each.xpath('div[@class="article-meta"]/time/text()').extract()  # Note the relative XPath used here
            items.append(item)
            # Print to the screen
            print(">>>", item['title'], item['href'], item['time'])
        return items
WeiyiGeek. Execution results
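Besides running scrapy crawl blog_spider (see the common commands above), the spider can also be started from a plain Python script; a minimal sketch, assuming it is executed from the weiyigeek project root so the settings and the import resolve:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from weiyigeek.spiders.blog_spider import BlogSpiderSpider

process = CrawlerProcess(get_project_settings())  # picks up the project's settings.py
process.crawl(BlogSpiderSpider)
process.start()                                   # blocks until the crawl finishes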
Description: a project that crawls the Douban Movie Top 250 list;
# Step 1. Create the spider project and initialize the crawler name
scrapy startproject douban
scrapy genspider douban_spider movie.douban.com
'''
Step 2. Edit the items template file
'''
class DoubanItem(scrapy.Item):
    serial_number = scrapy.Field()  # Serial number (rank)
    movie_name = scrapy.Field()     # Movie title
    introduce = scrapy.Field()      # Introduction
    star = scrapy.Field()           # Star rating
    evaluate = scrapy.Field()       # Number of ratings
    describle = scrapy.Field()      # Description (quote)
'''
Step 3. Edit the spider file
'''
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem  # Import the item container from douban/items.py

class DoubanSpiderSpider(scrapy.Spider):
    name = 'douban_spider'                            # Spider name
    allowed_domains = ['movie.douban.com']            # Domains the spider is allowed to crawl
    start_urls = ['https://movie.douban.com/top250']  # Start URL handed to the scheduler

    def parse(self, response):
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()  # Initialize the data model
            # Ending an XPath with text() fetches the text; extract_first() takes the first result
            douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first()  # Rank
            douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()  # Name
            descs = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract_first()  # Introduction text
            # Strip the whitespace
            desc_str = ''
            for i_desc in descs:
                i_desc_str = "".join(i_desc.split())
                desc_str += i_desc_str
            douban_item['introduce'] = desc_str  # Introduction
            douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first()  # Star rating
            douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first()  # Number of ratings
            douban_item['describle'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first()  # Description
            yield douban_item  # Push the item into the Item Pipeline for processing (important)
        # Handle the next page
        next_link = response.xpath("//div[@class='article']//span[@class='next']/link/@href").extract()
        if next_link:
            next_link = next_link[0]
            yield scrapy.Request("https://movie.douban.com/top250" + next_link, callback=self.parse)  # (Important)
# Notes:
# 1. After each for loop ends, the next-page link (next_link) has to be fetched
# 2. A check is needed because the last page has no next page
# 3. URL stitching: clicking page two gives https://movie.douban.com/top250?start=25&filter=
# 4. callback=self.parse registers the request callback
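The manual URL concatenation in the pagination step can also be replaced by response.urljoin(), which resolves the relative link against the current page; a minimal sketch of just that branch:
next_link = response.xpath("//span[@class='next']/link/@href").extract_first()
if next_link:
    yield scrapy.Request(response.urljoin(next_link), callback=self.parse)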
'''
Step 4. Edit the configuration file
'''
$ grep -E -v "^#" settings.py
BOT_NAME = 'douban'  # Project name
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
# Pipeline settings
ITEM_PIPELINES = {'douban.pipelines.DoubanPipeline': 300,}
# Downloader middleware settings
DOWNLOADER_MIDDLEWARES = {'douban.middlewares.my_proxy': 543, 'douban.middlewares.my_useragent': 544,}
# MongoDB connection information
mongo_host = '172.16.0.0'
mongo_port = 27017
mongo_db_name = 'douban'
mongo_db_collection = 'douban_movie'

'''
Step 5. Edit pipelines.py
'''
# -*- coding: utf-8 -*-
import pymongo
from douban.settings import mongo_host, mongo_port, mongo_db_name, mongo_db_collection

class DoubanPipeline(object):
    def __init__(self):
        host = mongo_host
        port = mongo_port
        dbname = mongo_db_name
        sheetname = mongo_db_collection
        client = pymongo.MongoClient(host=host, port=port)
        mydb = client[dbname]
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.post.insert(data)  # Note: newer pymongo versions use insert_one()
        return item
'''
Step 6. Edit the middleware file: middlewares.py
'''
# IP proxy middleware (disguises the crawler's IP address) / random User-Agent header middleware
import base64
import random
# Append the following classes to the end of the file:
class my_proxy(object):  # Proxy middleware
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http-cla.abuyun.com:9030'
        proxy_name_pass = b'H622272STYB666BW:F78990HJSS7'
        enconde_pass_name = base64.b64encode(proxy_name_pass)
        request.headers['Proxy-Authorization'] = 'Basic ' + enconde_pass_name.decode()
# Notes: the HTTP tunnel information comes from an Abuyun account
# request.meta['proxy']: 'server address:port'
# proxy_name_pass: b'license number:key'; the b prefix makes it a bytes string for base64
# base64.b64encode(): base64-encode the credentials
# 'Basic ': there must be a space after "Basic"
class my_useragent(object):  # Random User-Agent middleware
    def process_request(self, request, spider):
UserAgentList =["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)","Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)","Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)","Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)","Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)","Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6","Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0","Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5","Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20","Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER","Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)","Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)","Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 
Safari/537.1","Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre","Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11","Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",]
        agent = random.choice(UserAgentList)
        request.headers['User-Agent'] = agent
## Run scrapy crawl and you will see the middleware methods configured above take effect
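To see what the Proxy-Authorization header assembled in my_proxy above ends up containing, a small standalone sketch (the credentials are the same placeholder license:key pair used in the code, not real ones):
import base64

proxy_name_pass = b'H622272STYB666BW:F78990HJSS7'              # placeholder 'license:key' pair
value = 'Basic ' + base64.b64encode(proxy_name_pass).decode()  # note the space after 'Basic'
print(value)  # prints 'Basic ' followed by the base64-encoded credentials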
The data can also be saved to a JSON or CSV file.
WeiyiGeek. Question 1
Solution: download the Twisted wheel package from the official site (https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted) and install it.
Twisted‑19.2.0‑cp37‑cp37m‑win_amd64.whl
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Function: School area analysis
import urllib.request
import urllib.parse
from lxml import etree
number = []
name = []
file1 = open('2.txt', 'r', encoding='utf-8')
for i in file1:
    keyvalue = i.split(" ")
    number.append(str(keyvalue[0]))
    name.append(str(keyvalue[1]))
file1.close()

def test1(html, parm=""):
    dom_tree = etree.HTML(html)
    links = dom_tree.xpath("//div[@class='basic-info cmn-clearfix']/dl/dd/text()")
    for i in links:
        for v in range(0, len(number)):
            if (i.find(name[v][:2]) != -1):
                return number[v] + name[v] + parm + "\n"
    return "Not found (or overseas)"

file = open('1.txt', 'r+', encoding='utf-8')
file.seek(0, 0)
for eachline in file:
    url = "https://baike.baidu.com/item/" + urllib.parse.quote(eachline[6:])
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')  # Decode the response
    f = open('c:\\weiyigeek.txt', 'a+', encoding='utf-8')  # Open the output file
    res = test1(html, str(eachline[6:]))
    f.writelines(res)
    f.close()  # Close the output file
file.close()
WeiyiGeek. Post-execution effect