Python3 crawler data cleaning analysis

[TOC]

#### 0x00 Quick Start

#### 0x01 Blog Extraction and Analysis

Description: I wrote a script that builds a tag (word) cloud from the article titles of my own blog; it is a useful exercise for learning web crawling and data cleaning.

We need several libraries to generate the word cloud:
pip install numpy matplotlib wordcloud Pillow jieba

Practical example:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File       : blogWordCloud.py
# @CreateTime : 2019/7/12 14:52
# @Author     : WeiyiGeek
# @Function   : Generate a word cloud image from blog article titles
# @Software   : PyCharm

import requests
import jieba
import numpy as np
import matplotlib.pyplot as plt
from lxml import etree
from PIL import Image
from wordcloud import WordCloud

# Article titles collected by the crawler (filled in at runtime)
titlelist = list()
# Image whose shape is used as the word cloud mask
wc_mask_img = "bg.jpg"
# Font file passed to the word cloud renderer
WC_FONT_PATH = "simhei.ttf"

def get(url, timeout=10):
    """Fetch an archive page and collect its article titles into ``titlelist``.

    Downloads ``url``, reads the text of every link matched by
    ``//div/span[@class='archive-title']/a`` and appends each title to the
    module-level ``titlelist``. Titles equal to "Untitled" are skipped, and
    everything from the first ".md" onwards is dropped from a title.

    Args:
        url: Address of the blog archive page to crawl.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        SystemExit: With status 1 on a timeout, 0 on a connection error and
            2 on any other failure (status codes kept from the original
            script).
    """
    try:
        r = requests.get(url, timeout=timeout)
    # requests raises its own exception classes; they do not derive from the
    # built-in ConnectionError/TimeoutError, so those must not be used here.
    # Timeout is tested first because a connect timeout is also a
    # requests ConnectionError.
    except requests.exceptions.Timeout as e:
        print("[*] Time = " + str(e))
        raise SystemExit(1)
    except requests.exceptions.ConnectionError as e:
        print("[*] Error = " + str(e))
        raise SystemExit(0)
    except Exception as e:
        print("[*] Other Error = " + str(e))
        raise SystemExit(2)
    # r.raise_for_status() could be used to also reject HTTP error statuses

    print("URL:", r.url)
    r.encoding = "utf-8"  # encoding used if r.text is decoded

    # Parse the raw HTML bytes so the titles can be located with XPath
    dom_tree = etree.HTML(r.content)
    r.close()  # release the connection once the body has been consumed

    # Extract the article titles
    titles = dom_tree.xpath("//div/span[@class='archive-title']/a/text()")
    for title in titles:
        if title == "Untitled":
            continue
        # Keep only the part before ".md"; titles without it are unchanged
        titlelist.append(title.split(".md")[0])

def word():
    """Segment the collected titles and return the unique words as one string.

    Joins the entries of the module-level ``titlelist`` with spaces, splits
    the result with ``jieba.cut(..., cut_all=True)`` and keeps every segment
    longer than one character exactly once, in first-seen order.

    ``titlelist`` itself is only read, never reassigned, so calling this
    function repeatedly yields the same result.

    Returns:
        str: The unique words separated by single spaces (empty string when
        no word qualifies).
    """
    # Stitch the headlines into a single text for segmentation
    titlestring = " ".join(titlelist)

    # De-duplicate while preserving order; dict keys give O(1) membership.
    # Empty and single-character segments are discarded.
    unique_words = {}
    for token in jieba.cut(titlestring, cut_all=True):
        if len(token) > 1:
            unique_words.setdefault(token, None)

    return " ".join(unique_words)

def imgcloud():
    """Render the blog word cloud, save it as a PNG and display it.

    The image at ``wc_mask_img`` is used as the mask, ``WC_FONT_PATH`` as the
    font, and the text returned by ``word()`` as the input. The result is
    written to ``./blogWordCloud.png`` and then shown in a window.

    :return: None
    """
    # Load the mask picture; the context manager closes the file afterwards
    with Image.open(wc_mask_img) as mask_image:
        wc_mask = np.array(mask_image)

    wc = WordCloud(
        background_color="white",
        max_words=2000,
        scale=4,
        max_font_size=70,
        mask=wc_mask,
        random_state=42,
        font_path=WC_FONT_PATH,
    )
    # Generate the word cloud
    wc.generate(word())

    # Create the figure BEFORE drawing. Calling plt.figure() after imshow()
    # opens a new, empty figure, and saving that one produces a blank image.
    fig = plt.figure()
    # With only the mask set, the cloud takes the shape of the picture
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    fig.savefig("./blogWordCloud.png")  # note: save before show
    plt.show()


if __name__ == '__main__':
    url = "http://127.0.0.1:4000/archives/"
    get(url)
    imgcloud()

WeiyiGeek. Blog word cloud