wordcloudで凛として時雨の歌詞を可視化してみた
凛として時雨の歌詞でよく使われる単語を可視化してみた。
歌詞データをスクレイピング -> janomeで分かち書き -> wordcloudで可視化という流れになっている。
コード
# -*- coding: utf-8 -*-
"""Visualize frequently used words in song lyrics as a word cloud.

Pipeline: scrape the lyric pages linked from an uta-net artist page ->
tokenize the lyrics with janome -> render a word cloud with wordcloud.
"""
import urllib.request as req

import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from wordcloud import WordCloud

# NOTE: when running inside a Jupyter notebook, execute the IPython magic
# `%matplotlib inline` in a cell before calling wordcloud(). It is kept out
# of the code because it is not valid syntax in a plain Python file.

# Site root; song links on the artist page are site-relative hrefs.
BASE_URL = "https://www.uta-net.com"
# Artist page whose song list is scraped.
ARTIST_URL = "https://www.uta-net.com/artist/7840/6/"
# Japanese-capable font; WordCloud's default font cannot render Japanese.
FONT_PATH = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'

# The tokenizer is created once and shared by every call to wakati().
t = Tokenizer()


def getInfo(url, link_list=None):
    """Collect absolute song-page URLs from an artist page.

    Args:
        url: URL of the artist page to scrape.
        link_list: Optional list to append the links to. A fresh list is
            created when omitted, so repeated calls do not share state.

    Returns:
        The list (the one passed in, or a new one) with one absolute URL
        appended per ``<td class="side td1">`` cell that contains a link.
    """
    if link_list is None:
        link_list = []

    # Close the HTTP response as soon as the document has been parsed.
    with req.urlopen(url) as res:
        soup = BeautifulSoup(res, "html.parser")

    for cell in soup.find_all("td", class_="side td1"):
        anchor = cell.find("a")
        # Skip cells without a usable link instead of crashing on them.
        if anchor is None or not anchor.get("href"):
            continue
        link_list.append(BASE_URL + anchor["href"])
    return link_list


def wakati(word_list=None):
    """Fetch every linked lyric page and extract the words to visualize.

    Keeps every noun, plus verbs whose surface form is longer than two
    characters.

    Args:
        word_list: Optional list to append the words to. A fresh list is
            created when omitted, so repeated calls do not share state.

    Returns:
        The list (the one passed in, or a new one) of extracted surface
        forms, in order of appearance.
    """
    if word_list is None:
        word_list = []

    for link in getInfo(ARTIST_URL):
        with req.urlopen(link) as res:
            soup = BeautifulSoup(res, "html.parser")

        for doc in soup.find_all("div", id="kashi_area"):
            for token in t.tokenize(doc.text):
                # First comma-separated field is the top-level part of speech.
                pos = token.part_of_speech.split(',')[0]
                if pos == "名詞" or (pos == "動詞" and len(token.surface) > 2):
                    word_list.append(token.surface)
    return word_list


def wordcloud():
    """Build the word cloud, display it, and save it as ``wordcloud.png``."""
    # WordCloud.generate() expects whitespace-separated words.
    words = " ".join(wakati())
    cloud = WordCloud(
        background_color="white",
        font_path=FONT_PATH,
        width=900,
        height=500,
    ).generate(words)

    plt.figure(figsize=(15, 12))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    cloud.to_file('wordcloud.png')


if __name__ == "__main__":
    # The original called the undefined name create_wordcloud() here.
    wordcloud()
結果