Python: crawl all pictures from a website

The target is http://www.5442.com/meinv/

If you need to run it on a non-Linux system (for example Windows), change the path separator used in the code.

Here is the (admittedly rough) code:

# -*- coding: utf-8 -*-
import os
import re
import sys
import urllib
import urllib2

import chardet
# Earlier variant of get_html that returns the response body without any
# transcoding. It is wrapped in a bare triple-quoted string, so it is never
# executed; the get_html defined after it is the one actually in use.
'''
def get_html(url):#Get web content normally
 try:
  request = urllib2.Request(url,headers=ua_headers)
  response = urllib2.urlopen(request)
  html = response.read()return html
 except:
  print "Failed to get content"'''
def get_html(url):
    """Fetch *url* and return its body transcoded for the local system.

    The raw bytes are decoded with the charset detected by chardet and then
    re-encoded with ``sys.getfilesystemencoding()``. When chardet cannot
    determine a charset, ``gb2312`` is used; change that fallback to match
    the encoding of the website being crawled.

    Args:
        url: Address of the page to download. The request is sent with the
            module-level ``ua_headers``.

    Returns:
        The transcoded page content as a byte string, or ``None`` when the
        request or the transcoding fails (a message is printed in that case).
    """
    try:
        request = urllib2.Request(url, headers=ua_headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        typeEncode = sys.getfilesystemencoding()
        # chardet returns {'encoding': None} when detection fails, so a
        # .get() default would never apply; fall back explicitly instead.
        infoencode = chardet.detect(data).get('encoding') or 'gb2312'
        return data.decode(infoencode, 'ignore').encode(typeEncode)
    except Exception:
        # Best-effort fetch: report and signal failure with None, but let
        # KeyboardInterrupt/SystemExit propagate so the crawl can be stopped.
        print("Failed to get content")
        return None
        
# Headers sent with every page request made through get_html.
ua_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'
    ),
    'Cookie': 'AspxAutoDetectCookieSupport=1',
}

# Index page whose tag links are crawled.
url = "http://www.5442.com/meinv/"
########################################################
# First get all the tag links in the meinv page. The tag names are used
# afterwards to create the corresponding directories.
########################################################
tag_code = get_html(url)
tag_egrep = r'href="(.*).*" class="'
if tag_code is None:
    # get_html already printed the failure. Passing None to re.findall would
    # raise an uncaught TypeError, so continue with no tags instead.
    tag_url_list = []
else:
    tag_url_list = re.findall(tag_egrep, tag_code)
    print("")
    print("[V]Successfully crawled the links of all tag categories in the meinv page")
    print(tag_url_list)  # every link matched on the meinv page
# Main crawl: for every tag link, create a directory for the tag, fetch the
# tag page, create one sub-directory per post name found on it, and download
# the first picture of each page of each post.
for tag_url in tag_url_list:
    # The directory name is the link without its first 24 and last 5
    # characters (presumably the site prefix and ".html" - TODO confirm).
    tag_dir_name = tag_url[24:-5]
    # Windows alternative:
    # tag_mkpath = "C:\\Users\\Administrator\\Desktop\\Python-learn\\Photos\\" + tag_dir_name
    tag_mkpath = "Photos/" + tag_dir_name
    try:
        tag_mkdir = os.path.exists(tag_mkpath)
        print("")
        print("...The name of the tag has been successfully matched:" + tag_dir_name)
        if not tag_mkdir:
            os.makedirs(tag_mkpath)
            print("...create%s directory success----" % tag_dir_name)
        else:
            print("...Already this%s directory----" % tag_dir_name)
    except Exception:
        # The message has two placeholders, so it needs two values.
        print("...[X]Obtain%s link failed or created%s folder failed[X]"
              % (tag_dir_name, tag_dir_name))
        # Nothing can be saved without the tag directory; try the next tag.
        continue

    ##################################
    # Then use the tag link to get all post (tz) links.
    ##################################
    try:
        tz_code = get_html(tag_url)
        tz_url_egrep = r'href="(.*).*" target="_blank" title="'
        tz_url_list = re.findall(tz_url_egrep, tz_code)
        print(tz_url_list)
        # Local copy of the tag page; its existence ends the post loop below.
        xz_dir = tag_mkpath + ".html"
        for tz_url in tz_url_list:
            print(".........Link to current post---" + tz_url)
            # Label for the failure message in case the error happens before
            # any post name has been matched.
            x_tz_name = tag_dir_name
            try:
                urllib.urlretrieve(tag_url, xz_dir)
                # tz_name_egrep = r'_blank" title="(.*?)">'
                tz_name_egrep = r"<img alt='(.*?)' src"
                tz_name_list = re.findall(tz_name_egrep, tz_code)
                print(tz_name_list)
                t = 0
                ###############################################
                # Use the post names from the tag page to create one
                # directory per post.
                ###############################################
                for x_tz_name in tz_name_list:
                    print(".........Successfully matched" + x_tz_name)
                    tz_mkpath = tag_mkpath + "/" + x_tz_name
                    if not os.path.exists(tz_mkpath):
                        os.makedirs(tz_mkpath)
                        print(".........create%s directory success" % x_tz_name)
                    else:
                        print(".........Already%s this directory" % x_tz_name)
                    ###############################################
                    # Walk the pages of the post until a page cannot be
                    # fetched or contains no picture link.
                    ###############################################
                    xx = 0
                    while True:
                        try:
                            ttz_url = tz_url_list[t]  # post link for this post name
                            ###########################
                            # Build the link of the Nth page of the post:
                            # "<name>.html" becomes "<name>_<N>.html".
                            ###########################
                            if xx == 0:
                                tz_HQ_url = ttz_url
                            else:
                                tz_HQ_url = ttz_url[:-5] + "_" + str(xx) + ttz_url[-5:]
                            print("-------------------------------------------" + tz_HQ_url)
                            #######################
                            # Get all the picture links of the current page.
                            #######################
                            img_code = get_html(tz_HQ_url)
                            img_url_egrep = r"src='(.*).*' alt=''"
                            img_url_list = re.findall(img_url_egrep, img_code)
                            img = img_url_list[0]
                        except Exception:
                            # Any failure here (missing page, no picture
                            # link, index out of range) ends this post.
                            print("while false")
                            break
                        try:
                            print("............Successfully crawled links to all pictures in %s" % x_tz_name)
                            print("............[Links to all pictures]")
                            print(img_url_list)
                            print("............%s%s picture links:%s" % (x_tz_name, xx, img))
                            # The file name is the last 15 characters of the picture link.
                            img_name = tz_mkpath + "/" + img[-15:]
                            urllib.urlretrieve(img, img_name)  # download the picture
                            print("...............The picture has been downloaded successfully:" + img_name)
                            print("=========================================================")
                            print("=========================================================")
                            print("")
                        except Exception:
                            print("[X]Error downloading pictures!")
                            print("=========================================================")
                            print("=========================================================")
                            print("")
                        xx = xx + 1
                    # Advance two entries per post name (presumably each post
                    # is linked twice on the tag page - TODO confirm).
                    t = t + 2
            except Exception:
                print("Crawling the image link in %s failed!" % x_tz_name)
            ##########################################################################
            # If the local copy of the tag page exists after this pass, stop
            # looping over the post links and go straight to the next tag.
            ##########################################################################
            if os.access(str(xz_dir), os.F_OK):
                break
    except Exception:
        print("Crawling posts in %s failed" % tag_dir_name)

Recommended Posts

Python- crawl all pictures of a station
What can python crawlers crawl
How to use PYTHON to crawl news articles
Python- crawl all pictures of a station
Python implements horizontal stitching of pictures
Python realizes horizontal and vertical splicing of pictures
7 features of Python3.9
Recommendations of a few websites for learning Python
A brief summary of the difference between Python2 and Python3
Python drawing | A variety of typhoon path visualization methods
How does python call the key of a dictionary
How to understand a list of numbers in python
Python generates connotative pictures
Basics of Python syntax
Basic syntax of Python
Basic knowledge of Python (1)
Prettytable module of python
09. Common modules of Python3
How to find the area of a circle in python
A large inventory of commonly used third-party libraries in Python