TensorFlowのk平均法で文章をクラスタリングしてみた

TensorFlowでスポーツニュースと経済ニュースの文書をクラスタリングしてみたので、その時の手順をまとめてみたいと思います。

今回の大まかな流れとして以下のように進めたいと思います。
- 経済とスポーツのニュースをテキストで保存しておく
- Pythonでファイルを読み込む
- 読み込んだ文章を形態素解析し文書ごとで使われている単語の辞書を作成する
- 全ての文書を通して、出現した文書数が指定範囲内に収まる単語だけで辞書を作る
- ↑で作成した辞書を使って文書毎でベクトル化してk平均法の入力データを作成する
- TensorFlowでk平均法を実行する
- 文章のクラスタリング結果を表示する

文章をベクトル化

k平均法で文章をクラスタリングできるようにするためには文章をベクトルで表現できるようにする必要があります。
今回は文書内に特定の単語が含まれていたら、該当する要素を1にし含まれていなければ0にするといったものにしてみます。
実装は以下のようになりました。

import numpy as np    
import tensorflow as tf    
from janome.tokenizer import Tokenizer    
from janome.tokenizer import Tokenizer    
from janome.analyzer import Analyzer    
from janome.charfilter import *    
from janome.tokenfilter import *    
    
def wordToDicGen(tokenizer, char_filters, token_filters):
    """Build a function that maps a text to the words it contains.

    Args:
        tokenizer: janome ``Tokenizer`` handed to the analyzer.
        char_filters: character filters handed to the analyzer.
        token_filters: token filters handed to the analyzer.

    Returns:
        A function ``wordToDic(text)`` returning a dict whose keys are the
        ``base_form`` of every token the analyzer yields for ``text`` and
        whose values are all ``1``.
    """
    # Keyword arguments work with old and new janome releases alike; newer
    # releases declare these parameters keyword-only.
    # The analyzer does not depend on the text, so it is built only once.
    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters,
    )

    def wordToDic(text):
        return {token.base_form: 1 for token in analyzer.analyze(text)}

    return wordToDic
    
def keyWithIndex(dic: dict):
    """Number the keys of ``dic`` consecutively in sorted order.

    Returns a new dict mapping each key to its position (starting at 0) in
    the sorted key sequence; the values of ``dic`` are ignored.
    """
    return {word: position for position, word in enumerate(sorted(dic.keys()))}
    
def dicToVec(index_dic, target_dic):
    """Encode ``target_dic`` as a 0/1 vector over the vocabulary ``index_dic``.

    Args:
        index_dic: mapping from word to its position in the vector.
        target_dic: mapping whose keys are the words of one document.

    Returns:
        A NumPy array of length ``len(index_dic)`` holding 1.0 at the
        position of every key of ``target_dic`` that is also in
        ``index_dic`` and 0.0 everywhere else.
    """
    resultVec = np.zeros(len(index_dic))
    # The order of assignment does not affect the result, so the keys are
    # visited as-is instead of being sorted first.
    for key in target_dic:
        if key in index_dic:
            resultVec[index_dic[key]] = 1.0
    return resultVec
    
    
def mergeCountDic(someDic, newDic):
    """Add one to the count of every key of ``newDic`` inside ``someDic``.

    ``someDic`` is updated in place (keys not yet present start at 1) and
    the same object is returned; the values of ``newDic`` are not used.
    """
    for word in newDic:
        someDic[word] = someDic.get(word, 0) + 1
    return someDic
    
def dicCountFilter(min, max,  dic):
    """Keep the keys of ``dic`` whose value lies in the range [min, max).

    Returns a new dict that maps every retained key to ``1``.
    """
    return {word: 1 for word, count in dic.items() if min <= count < max}
    
def textToVec(textList: list, wordToDic, min_count=4, max_count=8):
    """Vectorize documents over a vocabulary filtered by document count.

    Args:
        textList: the documents to vectorize.
        wordToDic: function mapping one document to a dict keyed by its words.
        min_count: a word is kept only if it occurs in at least this many
            documents.
        max_count: a word is kept only if it occurs in fewer documents
            than this.

    Returns:
        A tuple ``(indexDic, vectors)``. ``indexDic`` maps each kept word
        to its column, and ``vectors`` stacks one ``dicToVec`` row per
        document. For an empty ``textList`` the array has zero rows.
    """
    someDic = dict()
    textDicList = list()
    for text in textList:
        textDic = wordToDic(text)
        someDic = mergeCountDic(someDic, textDic)
        textDicList.append(textDic)
    indexDic = keyWithIndex(dicCountFilter(min_count, max_count, someDic))
    if not textDicList:
        # np.vstack raises on an empty sequence, so build the empty matrix directly.
        return indexDic, np.zeros((0, len(indexDic)))
    return indexDic, np.vstack([dicToVec(indexDic, textDic) for textDic in textDicList])
    
    
# Analyzer configuration shared by every document.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
# Each part of speech is listed once; the duplicated '助動詞' entry was redundant.
token_filters = [
    CompoundNounFilter(),
    POSStopFilter(['記号', '助詞', '助動詞']),
    LowerCaseFilter(),
]
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters)
    
    
# Read news/keizai_01.txt .. news/keizai_10.txt; ``with`` closes each file
# as soon as it has been read instead of leaving the handle open.
keizaiNews = list()
for i in range(10):
    with open("news/keizai_" + '{:02}'.format(i + 1) + ".txt", "r") as newsFile:
        keizaiNews.append(newsFile.read())

# Read news/spo_01.txt .. news/spo_10.txt the same way.
spoNews = list()
for i in range(10):
    with open("news/spo_" + '{:02}'.format(i + 1) + ".txt", "r") as newsFile:
        spoNews.append(newsFile.read())
    
    
# Economy articles come first, followed by the sports articles.
someList = keizaiNews + spoNews

# Build the shared vocabulary and one 0/1 vector per document.
indexDic, textVec = textToVec(someList, wordToDic)

indexDicは文書全体の辞書になっており、結果は以下のようになりました。

{'いう': 0,    
 'いく': 1,    
 'おる': 2,    
 'かける': 3,    
 'この': 4,    
 'せる': 5,    
 'そして': 6,    
 'その': 7,    
 'できる': 8,    
 'もの': 9,    
 'よう': 10,    
 'られる': 11,    
 'わけ': 12,    
 'チーム': 13,    
 '一方': 14,    
 '中': 15,    
 '今': 16,    
 '出る': 17,    
 '出場': 18,    
 '受ける': 19,    
 '同じ': 20,    
 '多く': 21,    
 '大会': 22,    
 '強い': 23,    
 '思う': 24,    
 '日本': 25,    
 '決める': 26,    
 '決勝トーナメント': 27,    
 '発表': 28,    
 '第': 29,    
 '結果': 30,    
 '行う': 31,    
 '試合': 32,    
 '語る': 33,    
 '選手': 34,    
 '開く': 35,    
 '開幕': 36}

それからtextVecには文書毎でベクトル化した結果が入っていてtextVec[11]は以下の結果になりました。

array([1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,    
       1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1.,    
       1., 0., 0.])

これより全体の辞書と見比べると、textVec[11]の文章には"いう","いく","かける",…,"試合","語る","選手"の単語が含まれており、スポーツのニュースだと予想がつきます。

全体の辞書の件数が37件と少なくなっていますが、これは以下の関数でよく出る単語とあまり出ない単語はクラスタリングに影響しないようにfilterしているためになります。

def dicCountFilter(min, max,  dic):
    """Keep the keys of ``dic`` whose value lies in the range [min, max).

    Returns a new dict that maps every retained key to ``1``.
    """
    return {word: 1 for word, count in dic.items() if min <= count < max}

ベクトル化した文章にk平均法を実行

TensorFlowでのk平均法は以下のような実装になりました。

def input_fn_gen(textVec):
    """Create the zero-argument input function used for training and prediction.

    The returned function converts ``textVec`` to a float32 tensor and
    passes it through ``tf.train.limit_epochs`` with ``num_epochs=30``.
    """
    def input_fn():
        vectors = tf.convert_to_tensor(textVec, dtype=tf.float32)
        return tf.train.limit_epochs(vectors, num_epochs=30)

    return input_fn
    
def train_model(num_clusters, input_fn, num_iterations=10):
    """Fit a TensorFlow k-means estimator and return it with its centers.

    Args:
        num_clusters: number of clusters passed to ``KMeansClustering``.
        input_fn: zero-argument function passed to ``kmeans.train``.
        num_iterations: how many times ``kmeans.train`` is called; must be
            at least 1.

    Returns:
        A tuple ``(kmeans, cluster_centers)`` where ``cluster_centers`` is
        the value of ``kmeans.cluster_centers()`` after the last training
        call.

    Raises:
        ValueError: if ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")
    kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=num_clusters, use_mini_batch=False)
    for _ in range(num_iterations):
        kmeans.train(input_fn)
    # Only the final centers are returned, so they are read once after training.
    cluster_centers = kmeans.cluster_centers()
    return kmeans, cluster_centers
    
def showResult(kmeans, cluster_centers, input_fn, textVec):
    """Print the cluster index predicted for each row of ``textVec``.

    ``cluster_centers`` is accepted but not used. Predictions are read from
    ``kmeans.predict_cluster_index(input_fn)`` and matched to rows by position.
    """
    predicted = list(kmeans.predict_cluster_index(input_fn))
    for row_number, _ in enumerate(textVec):
        print('index:', row_number, 'is in cluster', predicted[row_number])
    
# Cluster the document vectors into two groups and print each assignment.
# The same input function is used for both training and prediction.
input_fn = input_fn_gen(textVec)
kmeans, cluster_centers =  train_model(2, input_fn)
showResult(kmeans, cluster_centers, input_fn, textVec)

分類結果を表示するshowResultの実行結果は以下のようになりました。

index: 0 is in cluster 0    
index: 1 is in cluster 0    
index: 2 is in cluster 0    
index: 3 is in cluster 0    
index: 4 is in cluster 0    
index: 5 is in cluster 0    
index: 6 is in cluster 0    
index: 7 is in cluster 0    
index: 8 is in cluster 0    
index: 9 is in cluster 0    
index: 10 is in cluster 1    
index: 11 is in cluster 1    
index: 12 is in cluster 0    
index: 13 is in cluster 0    
index: 14 is in cluster 1    
index: 15 is in cluster 0    
index: 16 is in cluster 0    
index: 17 is in cluster 1    
index: 18 is in cluster 1    
index: 19 is in cluster 0

文書の0 ~ 9が経済のニュースで10 ~ 19がスポーツニュースなのですが、経済は全て0に判定されスポーツは0の判定が5で1の判定が5となり、誤分類が多い結果となりました。
これについて全体の辞書の結果を確認してみるとスポーツ系の単語と一般的な単語で分かれているような感じに見え、スポーツのニュースなのに辞書にスポーツ系の単語が含まれていない場合は0の方で判定されているような気がします。試しに誤分類されている文書の12を確認してみます。

textVec[12]    
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,    
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,    
       0., 0., 0.])

4, 14, 25, 29, 30, 31番目の単語が含まれているので、この, 一方, 日本, 第, 結果, 行うが含まれているようですが、確かにこれではスポーツニュースに分類するのは難しそうに思います。
これらを踏まえるとスポーツならスポーツの局所情報、経済なら経済の局所情報を上手く取得して辞書が作れれば精度上がりそうな気がしました。stopワードを登録してfilterをゆるくするとかもあるのでしょうか。またはword2Vecで単語自体の意味が先にある程度わかっていたら精度が上がる気がします。

今回の実装をまとめると以下のようになりました。動作確認する場合、文書は自前で保存しておく必要があります。

import numpy as np    
import tensorflow as tf    
from janome.tokenizer import Tokenizer    
from janome.tokenizer import Tokenizer    
from janome.analyzer import Analyzer    
from janome.charfilter import *    
from janome.tokenfilter import *    
    
def wordToDicGen(tokenizer, char_filters, token_filters):
    """Build a function that maps a text to the words it contains.

    Args:
        tokenizer: janome ``Tokenizer`` handed to the analyzer.
        char_filters: character filters handed to the analyzer.
        token_filters: token filters handed to the analyzer.

    Returns:
        A function ``wordToDic(text)`` returning a dict whose keys are the
        ``base_form`` of every token the analyzer yields for ``text`` and
        whose values are all ``1``.
    """
    # Keyword arguments work with old and new janome releases alike; newer
    # releases declare these parameters keyword-only.
    # The analyzer does not depend on the text, so it is built only once.
    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters,
    )

    def wordToDic(text):
        return {token.base_form: 1 for token in analyzer.analyze(text)}

    return wordToDic
    
def keyWithIndex(dic: dict):
    """Number the keys of ``dic`` consecutively in sorted order.

    Returns a new dict mapping each key to its position (starting at 0) in
    the sorted key sequence; the values of ``dic`` are ignored.
    """
    return {word: position for position, word in enumerate(sorted(dic.keys()))}
    
def dicToVec(index_dic, target_dic):
    """Encode ``target_dic`` as a 0/1 vector over the vocabulary ``index_dic``.

    Args:
        index_dic: mapping from word to its position in the vector.
        target_dic: mapping whose keys are the words of one document.

    Returns:
        A NumPy array of length ``len(index_dic)`` holding 1.0 at the
        position of every key of ``target_dic`` that is also in
        ``index_dic`` and 0.0 everywhere else.
    """
    resultVec = np.zeros(len(index_dic))
    # The order of assignment does not affect the result, so the keys are
    # visited as-is instead of being sorted first.
    for key in target_dic:
        if key in index_dic:
            resultVec[index_dic[key]] = 1.0
    return resultVec
    
    
def mergeCountDic(someDic, newDic):
    """Add one to the count of every key of ``newDic`` inside ``someDic``.

    ``someDic`` is updated in place (keys not yet present start at 1) and
    the same object is returned; the values of ``newDic`` are not used.
    """
    for word in newDic:
        someDic[word] = someDic.get(word, 0) + 1
    return someDic
    
def dicCountFilter(min, max,  dic):
    """Keep the keys of ``dic`` whose value lies in the range [min, max).

    Returns a new dict that maps every retained key to ``1``.
    """
    return {word: 1 for word, count in dic.items() if min <= count < max}
    
def textToVec(textList: list, wordToDic, min_count=4, max_count=8):
    """Vectorize documents over a vocabulary filtered by document count.

    Args:
        textList: the documents to vectorize.
        wordToDic: function mapping one document to a dict keyed by its words.
        min_count: a word is kept only if it occurs in at least this many
            documents.
        max_count: a word is kept only if it occurs in fewer documents
            than this.

    Returns:
        A tuple ``(indexDic, vectors)``. ``indexDic`` maps each kept word
        to its column, and ``vectors`` stacks one ``dicToVec`` row per
        document. For an empty ``textList`` the array has zero rows.
    """
    someDic = dict()
    textDicList = list()
    for text in textList:
        textDic = wordToDic(text)
        someDic = mergeCountDic(someDic, textDic)
        textDicList.append(textDic)
    indexDic = keyWithIndex(dicCountFilter(min_count, max_count, someDic))
    if not textDicList:
        # np.vstack raises on an empty sequence, so build the empty matrix directly.
        return indexDic, np.zeros((0, len(indexDic)))
    return indexDic, np.vstack([dicToVec(indexDic, textDic) for textDic in textDicList])
    
    
# Analyzer configuration shared by every document.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
# Each part of speech is listed once; the duplicated '助動詞' entry was redundant.
token_filters = [
    CompoundNounFilter(),
    POSStopFilter(['記号', '助詞', '助動詞']),
    LowerCaseFilter(),
]
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters)
    
    
# Read news/keizai_01.txt .. news/keizai_10.txt; ``with`` closes each file
# as soon as it has been read instead of leaving the handle open.
keizaiNews = list()
for i in range(10):
    with open("news/keizai_" + '{:02}'.format(i + 1) + ".txt", "r") as newsFile:
        keizaiNews.append(newsFile.read())

# Read news/spo_01.txt .. news/spo_10.txt the same way.
spoNews = list()
for i in range(10):
    with open("news/spo_" + '{:02}'.format(i + 1) + ".txt", "r") as newsFile:
        spoNews.append(newsFile.read())
    
    
def input_fn_gen(textVec):
    """Create the zero-argument input function used for training and prediction.

    The returned function converts ``textVec`` to a float32 tensor and
    passes it through ``tf.train.limit_epochs`` with ``num_epochs=30``.
    """
    def input_fn():
        vectors = tf.convert_to_tensor(textVec, dtype=tf.float32)
        return tf.train.limit_epochs(vectors, num_epochs=30)

    return input_fn
    
def train_model(num_clusters, input_fn, num_iterations=10):
    """Fit a TensorFlow k-means estimator and return it with its centers.

    Args:
        num_clusters: number of clusters passed to ``KMeansClustering``.
        input_fn: zero-argument function passed to ``kmeans.train``.
        num_iterations: how many times ``kmeans.train`` is called; must be
            at least 1.

    Returns:
        A tuple ``(kmeans, cluster_centers)`` where ``cluster_centers`` is
        the value of ``kmeans.cluster_centers()`` after the last training
        call.

    Raises:
        ValueError: if ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")
    kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=num_clusters, use_mini_batch=False)
    for _ in range(num_iterations):
        kmeans.train(input_fn)
    # Only the final centers are returned, so they are read once after training.
    cluster_centers = kmeans.cluster_centers()
    return kmeans, cluster_centers
    
def showResult(kmeans, cluster_centers, input_fn, textVec):
    """Print the cluster index predicted for each row of ``textVec``.

    ``cluster_centers`` is accepted but not used. Predictions are read from
    ``kmeans.predict_cluster_index(input_fn)`` and matched to rows by position.
    """
    predicted = list(kmeans.predict_cluster_index(input_fn))
    for row_number, _ in enumerate(textVec):
        print('index:', row_number, 'is in cluster', predicted[row_number])
    
    
# Economy articles come first, followed by the sports articles.
someList = keizaiNews + spoNews

# Vectorize the documents, cluster them into two groups, and print each
# document's assignment.
indexDic, textVec = textToVec(someList, wordToDic)
input_fn = input_fn_gen(textVec)
kmeans, cluster_centers = train_model(2, input_fn)
showResult(kmeans, cluster_centers, input_fn, textVec)

idfを使ってフィルターする単語を選ぶ

idfは文書に稀に現れる単語ほど評価するものになっており、以下の式で表すことができます。
$$\mathrm{idf}_i = \log \frac{|D|}{|\{d : d \ni t_i\}|}$$
|D|は全文書数で、|{d : d ∋ t_i}|は単語t_iが現れる文書数になります。これよりidfは文書に稀に現れるほど大きな値になることがわかります。これを使うようにする場合、単語をフィルターする処理は以下のような実装になりました。

def dicIdfFilter(dic, doc_num, min_count=2, min_idf=1.5):
    """Select words by inverse document frequency.

    Args:
        dic: mapping from word to a count (the caller passes the number of
            documents each word occurs in).
        doc_num: total number of documents.
        min_count: words whose count is below this value are dropped.
        min_idf: words whose idf, ``log(doc_num / count)``, is below this
            value are dropped.

    Returns:
        A dict mapping each retained word to its idf value.
    """
    import math  # local import: this listing has no import block of its own

    resultDic = dict()
    for key, count in dic.items():
        # Testing the count first means that, with the default min_count,
        # a zero count never reaches the division below.
        if count < min_count:
            continue
        idf = math.log(doc_num / count)
        if idf >= min_idf:
            resultDic[key] = idf
    return resultDic

def textToVec(textList: list, wordToDic):
    """Vectorize documents over a vocabulary chosen by ``dicIdfFilter``.

    Each document is converted with ``wordToDic``, the per-word counts are
    accumulated with ``mergeCountDic``, and the words that pass
    ``dicIdfFilter`` are indexed with ``keyWithIndex``.

    Returns:
        A tuple ``(indexDic, vectors)`` where ``vectors`` stacks one
        ``dicToVec`` row per document.
    """
    documentWords = [wordToDic(text) for text in textList]
    wordCounts = dict()
    for words in documentWords:
        wordCounts = mergeCountDic(wordCounts, words)
    indexDic = keyWithIndex(dicIdfFilter(wordCounts, len(textList)))
    rows = [dicToVec(indexDic, words) for words in documentWords]
    return indexDic, np.vstack(rows)

これでエンタメ、経済、スポーツのニュース記事20件ずつ分類すると以下のような結果になりました。

index: 0 is in cluster 0
index: 1 is in cluster 2
index: 2 is in cluster 0
index: 3 is in cluster 1
index: 4 is in cluster 2
index: 5 is in cluster 0
index: 6 is in cluster 0
index: 7 is in cluster 0
index: 8 is in cluster 0
index: 9 is in cluster 0
index: 10 is in cluster 0
index: 11 is in cluster 0
index: 12 is in cluster 0
index: 13 is in cluster 0
index: 14 is in cluster 0
index: 15 is in cluster 0
index: 16 is in cluster 0
index: 17 is in cluster 2
index: 18 is in cluster 0
index: 19 is in cluster 0
index: 20 is in cluster 2
index: 21 is in cluster 2
index: 22 is in cluster 2
index: 23 is in cluster 2
index: 24 is in cluster 2
index: 25 is in cluster 2
index: 26 is in cluster 2
index: 27 is in cluster 2
index: 28 is in cluster 2
index: 29 is in cluster 2
index: 30 is in cluster 2
index: 31 is in cluster 2
index: 32 is in cluster 2
index: 33 is in cluster 2
index: 34 is in cluster 2
index: 35 is in cluster 2
index: 36 is in cluster 2
index: 37 is in cluster 2
index: 38 is in cluster 2
index: 39 is in cluster 2
index: 40 is in cluster 0
index: 41 is in cluster 1
index: 42 is in cluster 0
index: 43 is in cluster 0
index: 44 is in cluster 1
index: 45 is in cluster 0
index: 46 is in cluster 2
index: 47 is in cluster 1
index: 48 is in cluster 1
index: 49 is in cluster 0
index: 50 is in cluster 2
index: 51 is in cluster 0
index: 52 is in cluster 1
index: 53 is in cluster 2
index: 54 is in cluster 2
index: 55 is in cluster 0
index: 56 is in cluster 2
index: 57 is in cluster 0
index: 58 is in cluster 0
index: 59 is in cluster 0

0 ~ 19がエンタメで20 ~ 39が経済、40 ~ 59がスポーツニュースなのですが経済のニュースは分類できてそうな結果になりました。