Python3クローラーデータクリーニング分析

[TOC]

#### 0x00 クイックスタート

#### 0x01 ブログの抽出と分析

説明：自分のブログのタグクラウドを作成しました。これは、クロールとデータクリーニングの学習に非常に役立ちます。

ワードクラウドを生成するには、いくつかのライブラリを使用する必要があります。
pip install numpy matplotlib wordcloud Pillow jieba requests lxml

実際のケース：


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @ File : blogWordCloud.py
# @ CreateTime : 2019/7/12 14:52
# @ Author : WeiyiGeek
# @ Function :ブログワードクラウド画像生成を実現
# @ Software: PyCharm

import requests
import jieba
import numpy as np
import matplotlib.pyplot as plt
from lxml import etree
from PIL import Image
from wordcloud import WordCloud

# Article titles collected by get(); word() later replaces this list with
# the de-duplicated words.
titlelist = []
# Image file whose shape is used as the word-cloud mask.
wc_mask_img = "bg.jpg"
# Font file passed to WordCloud for rendering.
WC_FONT_PATH = "simhei.ttf"

def get(url):
    """Fetch the archive page at *url* and collect its article titles.

    Every title matched by the archive XPath is appended to the module-level
    ``titlelist``. Titles equal to "無題" are skipped, and for titles that
    contain ".md" only the part before the first ".md" is kept.

    If the request fails, a message is printed and the process exits with
    status 0 (connection error), 1 (timeout) or 2 (any other error).
    """
    # requests raises its own exception classes; the built-in ConnectionError
    # and TimeoutError are different types and would not match them.
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, timeout=10)
    except requests.exceptions.ConnectionError as e:
        print("[*] Error = " + str(e))
        raise SystemExit(0)
    except requests.exceptions.Timeout as e:
        print("[*] Time = " + str(e))
        raise SystemExit(1)
    except Exception as e:
        print("[*] Other Error = " + str(e))
        raise SystemExit(2)

    print("URL:", r.url)
    # Grab the body first, then release the connection.
    html = r.content
    r.close()

    # Parse the page into an element tree so XPath can be applied.
    dom_tree = etree.HTML(html)
    # Text of every article-title link on the archive page.
    titles = dom_tree.xpath("//div/span[@class='archive-title']/a/text()")
    for title in titles:
        if title == "無題":
            continue
        if ".md" in title:
            title = title.split(".md")[0]
        titlelist.append(title)

def word():
    """Segment the collected titles with jieba and return the unique words.

    Side effect: the module-level ``titlelist`` is replaced by the list of
    de-duplicated words, in first-seen order, with one-character tokens
    dropped.

    Returns:
        The unique words joined by single spaces.
    """
    global titlelist
    # Every title is followed by one space, so the segmenter input keeps
    # its trailing space.
    titlestring = "".join(title + " " for title in titlelist)

    # Full-mode segmentation of the combined titles.
    wordlist = jieba.cut(titlestring, cut_all=True)

    # dict.fromkeys removes duplicates in linear time while keeping
    # first-seen order.
    titlelist = list(
        dict.fromkeys(token for token in wordlist if len(token) != 1)
    )
    return " ".join(titlelist)

def imgcloud():
    """Generate the word cloud, save it to ./blogWordCloud.png and show it.

    The cloud is shaped by the image at ``wc_mask_img``, rendered with the
    font at ``WC_FONT_PATH`` and built from the text returned by ``word()``.
    """
    # Load the mask image; the context manager closes the file handle.
    with Image.open(wc_mask_img) as mask_image:
        wc_mask = np.array(mask_image)

    wc = WordCloud(
        background_color="white",
        max_words=2000,
        scale=4,
        max_font_size=70,
        mask=wc_mask,
        random_state=42,
        font_path=WC_FONT_PATH,
    )
    wc.generate(word())

    # Create the figure before drawing. Opening a new figure after imshow()
    # makes the new, empty figure the current one, and that blank canvas is
    # what would get saved.
    fig = plt.figure()
    # With only a mask set, the cloud takes the shape of the mask picture.
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # The figure must be saved before show() is called.
    fig.savefig("./blogWordCloud.png")
    plt.show()


if __name__ == '__main__':
    url = "http://127.0.0.1:4000/archives/"
    get(url)
    imgcloud()

WeiyiGeek。ブログワードクラウド