The target is http://www.5442.com/meinv/.
If you run this on a non-Linux system, change the path separators in the code accordingly.
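A more portable option, sketched here but not part of the original script, is to build paths with os.path.join, which picks the right separator for the current platform (tag_dir_name below is a placeholder for the tag name the script extracts later):

import os

tag_dir_name = "example"  # placeholder; the real script slices this out of each tag URL
tag_mkpath = os.path.join("Photos", tag_dir_name)  # "Photos/example" on Linux, "Photos\example" on Windows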
# -*- coding: utf-8 -*-
import re
import urllib
import urllib2
import os
import chardet
import sys
'''
def get_html(url):  # Get web content without transcoding
    try:
        request = urllib2.Request(url, headers=ua_headers)
        response = urllib2.urlopen(request)
        html = response.read()
        return html
    except:
        print "Failed to get content"
'''
def get_html(url):  # Get web content with transcoding
    try:
        request = urllib2.Request(url, headers=ua_headers)
        data = urllib2.urlopen(request).read()
        typeEncode = sys.getfilesystemencoding()
        infoencode = chardet.detect(data).get('encoding', 'gb2312')  # Change the 'gb2312' fallback here to match the website's encoding
        html = data.decode(infoencode, 'ignore').encode(typeEncode)
        return html
    except:
        print "Failed to get content"
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
    'Cookie': 'AspxAutoDetectCookieSupport=1',
}
url = "http://www.5442.com/meinv/"
########################################################
# First get all the tag links on the meinv page, then use each tag name to create the corresponding directory.
########################################################
tag_code = get_html(url)
tag_egrep = r'href="(.*).*" class="'
tag_url_list = re.findall(tag_egrep, tag_code)
print
print "[V] Successfully crawled the links of all tag categories on the meinv page"
print tag_url_list  # Print all the tag links found on the meinv page
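# A worked example of the tag_url[24:-5] slice below (the URL shape is
# inferred from the slice offsets, not verified): for a link such as
#   http://www.5442.com/tag/xinggan.html
# the 24-character prefix "http://www.5442.com/tag/" and the 5-character
# ".html" suffix are dropped, leaving "xinggan" as the directory name.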
for tag_url in tag_url_list:
    try:
        tag_dir_name = tag_url[24:-5]
        # tag_mkpath = "C:\\Users\\Administrator\\Desktop\\Python-learn\\Photos\\" + tag_dir_name
        tag_mkpath = "Photos/" + tag_dir_name
        tag_mkdir = os.path.exists(tag_mkpath)
        print
        print "...Successfully matched the tag name: " + tag_dir_name
        if not tag_mkdir:
            os.makedirs(tag_mkpath)
            print "...Created the %s directory----" % tag_dir_name
        else:
            print "...The %s directory already exists----" % tag_dir_name
    except:
        print "...[X] Failed to get the link or create the folder for %s [X]" % tag_url
##################################
# Then use each tag link to get all the post (tz) links.
##################################
    try:
        tz_code = get_html(tag_url)
        tz_url_egrep = r'href="(.*).*" target="_blank" title="'
        tz_url_list = re.findall(tz_url_egrep, tz_code)
        print tz_url_list
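        # Illustrative only: tz_url_list should now hold post links shaped
        # roughly like http://www.5442.com/meinv/20xx/xxxx.html (inferred
        # from the ".html" slicing further down, not verified).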
        for tz_url in tz_url_list:
            print ".........Link to the current post---" + tz_url
            try:
                xz_dir = tag_mkpath + ".html"
                urllib.urlretrieve(tag_url, xz_dir)  # Save the tag page itself; its existence is checked at the bottom of the loop
                # tz_name_egrep = r'_blank" title="(.*?)">'
                tz_name_egrep = r"<img alt='(.*?)' src"
                tz_name_list = re.findall(tz_name_egrep, tz_code)
                print tz_name_list
                t = 0
###############################################
# Then extract all the post (tz) names from the tag page and create the corresponding directories.
###############################################
                for x_tz_name in tz_name_list:
                    print ".........Successfully matched " + x_tz_name
                    tz_mkpath = tag_mkpath + "/" + x_tz_name
                    tz_mkdir = os.path.exists(tz_mkpath)
                    if not tz_mkdir:
                        os.makedirs(tz_mkpath)
                        print ".........Created the %s directory" % x_tz_name
                    else:
                        print ".........The %s directory already exists" % x_tz_name
###############################################
# Then page through the current post, building the link for each of its pages.
###############################################
                    xx = 0
                    while True:
                        try:
                            ttz_url = tz_url_list[t]  # Step through the post links manually
###########################
# Build the link to the Nth page of the post
###########################
                            if xx == 0:
                                tz_HQ_url = ttz_url
                            else:
                                tz_hz_url = ttz_url[-5:]        # the ".html" suffix
                                tz_qz_url = ttz_url[:-5] + "_"  # everything before it, plus "_"
                                tz_HQ_url = tz_qz_url + str(xx) + tz_hz_url
                            print "-------------------------------------------" + tz_HQ_url
#######################
# Get all the picture links on the current page
#######################
                            img_code = get_html(tz_HQ_url)
                            img_url_egrep = r"src='(.*).*' alt=''"
                            img_url_list = re.findall(img_url_egrep, img_code)
                            img = img_url_list[0]
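                            # Illustrative only: img[-15:] below keeps just the
                            # last 15 characters of the image URL as the local
                            # file name, so URLs sharing the same tail collide.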
print "............Successfully crawled to%Links to all pictures in s"% x_tz_name
print "............[Links to all pictures]"
print img_url_list
print "............%s%s picture links:%s"%(x_tz_name,xx,img)
img_name = tag_mkpath +"/"+x_tz_name +"/"+ img[-15:]
urllib.urlretrieve(img,img_name)#Download pictures
print "...............The picture has been downloaded successfully:"+img_name
print "========================================================="
print "========================================================="
print
except:
print "[X]Error downloading pictures!"
print "========================================================="
print "========================================================="
print
                            xx = xx + 1
                        except:
                            print "while false"  # No further pages (or posts); leave the while loop
                            break
                    t = t + 2  # Step by 2; each post link apparently appears twice in tz_url_list
            except:
                print "Crawling the image links in %s failed!" % x_tz_name
##########################################################################
# After the current pass, check whether the saved tag page exists; if it does, end the loop and move straight on to the next tag page.
##########################################################################
            if os.access(str(xz_dir), os.F_OK):
                break
            else:
                pass
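            # os.access(..., os.F_OK) is simply an existence test on the tag
            # page saved by urlretrieve earlier; os.path.exists(xz_dir) would
            # be the more common spelling.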
            # This scrappy code is not worth showing to outsiders
    except:
        print "Crawling the posts in %s failed" % tag_dir_name