[TOC]
Description: I wrote a script that builds a tag (word) cloud from the article titles of my own blog. It is a useful exercise for learning web crawling and data cleaning.
Several libraries are needed to generate the word cloud:
pip install numpy matplotlib wordcloud Pillow jieba requests lxml
Example script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File       : blogWordCloud.py
# @CreateTime : 2019/7/12 14:52
# @Author     : WeiyiGeek
# @Function   : Generate a word cloud image from the blog's article titles
# @Software   : PyCharm
import requests
import jieba
import numpy as np
import matplotlib.pyplot as plt
from lxml import etree
from PIL import Image
from wordcloud import WordCloud
# Article titles collected from the archive page (filled in by get())
titlelist = []
# Image file whose outline gives the word cloud its shape
wc_mask_img = 'bg.jpg'
# Font file used to draw the words
WC_FONT_PATH = 'simhei.ttf'
def get(url, timeout=10):
    """Fetch the archive page at ``url`` and collect its article titles.

    Titles are taken from ``//div/span[@class='archive-title']/a`` and
    appended to the module-level ``titlelist``. Entries equal to "Untitled"
    are skipped, and everything from the first ".md" onwards is cut off.

    :param url: address of the blog archive page.
    :param timeout: seconds to wait for the server before giving up.
    :raises SystemExit: with a non-zero status when the request fails
        (1 = timeout, 2 = other request/HTTP error, 3 = connection error).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        r.raise_for_status()
    # requests raises its own exception classes; the built-in ConnectionError
    # and TimeoutError are unrelated to them, so the requests classes must be
    # named here. Timeout is checked first because ConnectTimeout is both a
    # Timeout and a ConnectionError.
    except requests.exceptions.Timeout as e:
        print("[*] Time = " + str(e))
        raise SystemExit(1)
    except requests.exceptions.ConnectionError as e:
        print("[*] Error = " + str(e))
        # Non-zero status: a failed download must not be reported as success
        raise SystemExit(3)
    except requests.exceptions.RequestException as e:
        print("[*] Other Error = " + str(e))
        raise SystemExit(2)

    print("URL:", r.url)
    # Only the raw bytes are parsed, so r.encoding (which affects r.text
    # only) does not need to be set.
    page = r.content
    r.close()  # Close requests request object

    # Parse the HTML so the titles can be extracted with XPath
    dom_tree = etree.HTML(page)
    titles = dom_tree.xpath("//div/span[@class='archive-title']/a/text()")

    for title in titles:
        if title == "Untitled":
            continue
        # split() returns the whole title unchanged when ".md" is absent
        titlelist.append(title.split(".md")[0])
def word():
    """Build the space-separated word string fed to the word cloud.

    Joins every title in the module-level ``titlelist``, segments the result
    with ``jieba.cut(..., cut_all=True)`` and keeps each segment once, in
    first-seen order, dropping segments shorter than two characters.

    ``titlelist`` is only read, never rebound, so the collected titles stay
    intact and calling this function again yields the same result.

    :return: the unique segments joined by single spaces.
    """
    # Headline stitching
    titlestring = " ".join(titlelist)
    # Segment the data
    segments = jieba.cut(titlestring, cut_all=True)
    # dict.fromkeys de-duplicates in one pass while keeping first-seen order
    # (dicts preserve insertion order since Python 3.7). The length filter
    # removes single characters and any empty token the tokenizer may emit.
    unique_words = dict.fromkeys(seg for seg in segments if len(seg) > 1)
    return " ".join(unique_words)
def imgcloud():
    """Render the blog word cloud, save it as a PNG and display it.

    The cloud is shaped by the image at ``wc_mask_img``, drawn with the font
    at ``WC_FONT_PATH`` and filled with the text returned by ``word()``. The
    picture is written to ``./blogWordCloud.png`` and then shown on screen.

    :return: None
    """
    # Set word cloud shape picture; the context manager closes the image
    # file once its pixels have been copied into the array
    with Image.open(wc_mask_img) as mask_image:
        wc_mask = np.array(mask_image)

    wc = WordCloud(
        background_color="white",
        max_words=2000,
        scale=4,
        max_font_size=70,
        mask=wc_mask,
        random_state=42,
        font_path=WC_FONT_PATH,
    )
    # Generate word cloud
    wc.generate(word())

    # Create the figure BEFORE drawing. Calling plt.figure() after imshow()
    # opens a new, empty figure, and saving that one produces a blank PNG.
    fig = plt.figure()
    # In the case of only setting the mask, you will get a word cloud in the
    # shape of a picture
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    fig.savefig("./blogWordCloud.png")  # Note that the save should be before the show
    plt.show()


if __name__ == '__main__':
    url = "http://127.0.0.1:4000/archives/"
    get(url)
    imgcloud()
WeiyiGeek. Blog word cloud
Recommended Posts