


- 経済とスポーツのニュースをテキストで保存しておく
- Pythonでファイルを読み込む
- 読み込んだ文章を形態素解析し文書ごとで使われている単語の辞書を作成する
- 全文書を通して、出現する文書数が指定した範囲内に収まる単語だけで辞書を作る
- ↑で作成した辞書を使って文書毎でベクトル化してk平均法の入力データを作成する
- TensorFlowでk平均法を実行する
- 文章のクラスタリング結果を表示する



import math

import numpy as np
import tensorflow as tf
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
def wordToDicGen(tokenizer, char_filters, token_filters):
    """Build a function that turns a text into a {base_form: 1} dictionary.

    Args:
        tokenizer: janome tokenizer handed to the Analyzer.
        char_filters: character filters handed to the Analyzer.
        token_filters: token filters handed to the Analyzer.

    Returns:
        A function ``wordToDic(text)`` that analyzes ``text`` and returns a
        dict whose keys are the base forms of the resulting tokens; every
        value is 1, so the dict records presence rather than frequency.
    """
    def wordToDic(text):
        # Keyword arguments are used because newer janome releases declare
        # the Analyzer parameters as keyword-only; older releases accept
        # the same keywords, so this form works with both.
        analyzer = Analyzer(
            char_filters=char_filters,
            tokenizer=tokenizer,
            token_filters=token_filters,
        )
        return {token.base_form: 1 for token in analyzer.analyze(text)}
    return wordToDic
def keyWithIndex(dic: dict):
    """Map every key of ``dic`` to its position in sorted key order.

    Returns a new dict ``{key: index}`` with indices starting at 0; the
    values of ``dic`` are ignored.
    """
    return {word: position for position, word in enumerate(sorted(dic))}
def dicToVec(index_dic, target_dic):
    """Encode ``target_dic`` as a 0/1 vector over the words in ``index_dic``.

    Args:
        index_dic: dict mapping a word to its position in the vector.
        target_dic: dict whose keys are the words present in one document.

    Returns:
        A float numpy array of length ``len(index_dic)`` holding 1.0 at the
        position of every key of ``target_dic`` that is also in
        ``index_dic`` and 0.0 everywhere else.
    """
    vec = np.zeros(len(index_dic))
    for word in sorted(target_dic):
        if word not in index_dic:
            # Words outside the vocabulary contribute nothing.
            continue
        position = index_dic[word]
        vec[position] = 1.0
    return vec
def mergeCountDic(someDic, newDic):
    """Add one to the count of every key of ``newDic`` inside ``someDic``.

    Keys not yet present in ``someDic`` are inserted with a count of 1.
    Only the keys of ``newDic`` are used; its values are ignored.

    Args:
        someDic: running ``{key: count}`` dict; it is updated in place.
        newDic: dict whose keys should each be counted once.

    Returns:
        The same ``someDic`` object, after the update.
    """
    for key in newDic:
        # The original incremented and then unconditionally reset the value
        # to 1, and never inserted new keys (the else branch was missing).
        someDic[key] = someDic.get(key, 0) + 1
    return someDic
def dicCountFilter(min, max,  dic):
    """Keep the keys of ``dic`` whose value lies in the range [min, max).

    Args:
        min: inclusive lower bound on the value.
        max: exclusive upper bound on the value.
        dic: dict mapping a key to a comparable value (a count).

    Returns:
        A new dict holding every surviving key with the value 1.
    """
    return {word: 1 for word, count in dic.items() if min <= count < max}
def textToVec(textList: list, wordToDic, min_count=4, max_count=8):
    """Turn a list of texts into a vocabulary and a matrix of 0/1 vectors.

    Args:
        textList: the texts to vectorize, one entry per document.
        wordToDic: function mapping a text to a dict keyed by its words.
        min_count: inclusive lower bound on the number of documents a word
            must appear in to enter the vocabulary.
        max_count: exclusive upper bound on that number of documents.

    Returns:
        A tuple ``(indexDic, matrix)``: ``indexDic`` maps each vocabulary
        word to its column index and ``matrix`` stacks one row per document.

    Raises:
        ValueError: if ``textList`` contains no documents.
    """
    someDic = dict()
    textDicList = list()
    for text in textList:
        textDic = wordToDic(text)
        # Keep the per-document dictionary; the original never stored it,
        # so the final stacking step always received an empty list.
        textDicList.append(textDic)
        someDic = mergeCountDic(someDic, textDic)
    if not textDicList:
        raise ValueError("textList must contain at least one document")
    indexDic = keyWithIndex(dicCountFilter(min_count, max_count, someDic))
    return indexDic, np.vstack([dicToVec(indexDic, textDic) for textDic in textDicList])
# Text analysis setup: character filters, tokenizer and token filters that
# wordToDicGen wires into the per-document word extractor.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), POSStopFilter(['記号','助詞', '助動詞', '助動詞']), LowerCaseFilter()]
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters)

# Load the ten economy articles (news/keizai_01.txt .. news/keizai_10.txt).
keizaiNews = list()
for i in range(10):
    # "with" closes each file as soon as it has been read.
    with open("news/keizai_"  + '{:02}'.format(i+1) + ".txt", "r") as f:
        keizaiNews.append(f.read())

# Load the ten sports articles (news/spo_01.txt .. news/spo_10.txt).
spoNews = list()
for i in range(10):
    with open("news/spo_"  + '{:02}'.format(i+1) + ".txt", "r") as f:
        spoNews.append(f.read())

# Documents 0-9 are economy news and 10-19 are sports news. The original
# left this list empty, so nothing was ever vectorized.
someList = keizaiNews + spoNews
indexDic, textVec = textToVec(someList, wordToDic)


{'いう': 0,    
 'いく': 1,    
 'おる': 2,    
 'かける': 3,    
 'この': 4,    
 'せる': 5,    
 'そして': 6,    
 'その': 7,    
 'できる': 8,    
 'もの': 9,    
 'よう': 10,    
 'られる': 11,    
 'わけ': 12,    
 'チーム': 13,    
 '一方': 14,    
 '中': 15,    
 '今': 16,    
 '出る': 17,    
 '出場': 18,    
 '受ける': 19,    
 '同じ': 20,    
 '多く': 21,    
 '大会': 22,    
 '強い': 23,    
 '思う': 24,    
 '日本': 25,    
 '決める': 26,    
 '決勝トーナメント': 27,    
 '発表': 28,    
 '第': 29,    
 '結果': 30,    
 '行う': 31,    
 '試合': 32,    
 '語る': 33,    
 '選手': 34,    
 '開く': 35,    
 '開幕': 36}    


array([1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,    
       1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1.,    
       1., 0., 0.])    



def dicCountFilter(min, max,  dic):
    """Select the keys of ``dic`` whose count is at least ``min`` and below ``max``.

    Returns a new dict in which every selected key maps to 1; ``dic``
    itself is left unchanged.
    """
    selected = dict()
    for word, count in dic.items():
        if count < min or count >= max:
            # Outside the half-open range [min, max): drop the word.
            continue
        selected[word] = 1
    return selected



def input_fn_gen(textVec):
    """Return an input function that feeds ``textVec`` to an estimator.

    The returned zero-argument function converts ``textVec`` to a float32
    tensor and wraps it with ``tf.train.limit_epochs`` using
    ``num_epochs=30``.
    """
    def input_fn():
        features = tf.convert_to_tensor(textVec, dtype=tf.float32)
        return tf.train.limit_epochs(features, num_epochs=30)
    return input_fn
def train_model(num_clusters, input_fn):
    """Fit a k-means model and return it with its final cluster centers.

    Args:
        num_clusters: number of clusters to fit.
        input_fn: zero-argument function supplying the training tensor.

    Returns:
        A tuple ``(kmeans, cluster_centers)`` holding the trained
        ``KMeansClustering`` estimator and the centers read back after the
        last training round.
    """
    kmeans = tf.contrib.factorization.KMeansClustering(
        num_clusters=num_clusters, use_mini_batch=False)
    # train
    num_iterations = 10
    cluster_centers = None
    for _ in range(num_iterations):
        # The original loop only read the centers and never trained the
        # estimator; each round must train before the centers are read.
        kmeans.train(input_fn)
        cluster_centers = kmeans.cluster_centers()
    return kmeans, cluster_centers
def showResult(kmeans, cluster_centers, input_fn, textVec):
    """Print the cluster index assigned to every row of ``textVec``.

    Args:
        kmeans: object providing ``predict_cluster_index(input_fn)``.
        cluster_centers: accepted but not used here.
        input_fn: input function passed to ``predict_cluster_index``.
        textVec: the document vectors; only their count and order matter.
    """
    # Materialize the predictions so they can be looked up by row number.
    assignments = list(kmeans.predict_cluster_index(input_fn))
    for doc_no, _ in enumerate(textVec):
        print('index:', doc_no, 'is in cluster', assignments[doc_no])
# Run the clustering: wrap the document vectors in an input function,
# build a 2-cluster k-means model, then print each document's cluster.
input_fn = input_fn_gen(textVec)
kmeans, cluster_centers =  train_model(2, input_fn)
showResult(kmeans, cluster_centers, input_fn, textVec)


index: 0 is in cluster 0    
index: 1 is in cluster 0    
index: 2 is in cluster 0    
index: 3 is in cluster 0    
index: 4 is in cluster 0    
index: 5 is in cluster 0    
index: 6 is in cluster 0    
index: 7 is in cluster 0    
index: 8 is in cluster 0    
index: 9 is in cluster 0    
index: 10 is in cluster 1    
index: 11 is in cluster 1    
index: 12 is in cluster 0    
index: 13 is in cluster 0    
index: 14 is in cluster 1    
index: 15 is in cluster 0    
index: 16 is in cluster 0    
index: 17 is in cluster 1    
index: 18 is in cluster 1    
index: 19 is in cluster 0    

文書の0 ~ 9が経済のニュースで10 ~ 19がスポーツニュースなのですが、経済は全て0に判定され、スポーツは0の判定が5件、1の判定が5件となり、誤分類が多い結果となりました。

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,    
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,    
       0., 0., 0.])    

4, 14, 25, 29, 30, 31番目の単語が含まれているので、この, 一方, 日本, 第, 結果, 行うが含まれているようですが、確かにこれではスポーツニュースに分類するのは難しそうに思います。


idfは次の式で定義されます。

$$\mathrm{idf}_i = \log \frac{|D|}{|\{d : d \ni t_i\}|}$$

|D|は文書数で、|{d : d ∋ t_i}|は全文書中で単語t_iが現れる文書の数になります。これよりidfは、その単語が現れる文書が少ないほど大きな値になることがわかります。これを使うようにする場合、単語をフィルターする処理は以下のような実装になりました。

def dicIdfFilter(dic, doc_num, min_count=2, min_idf=1.5):
    """Keep the words whose idf is high enough, mapped to that idf value.

    The idf of a word is ``log(doc_num / count)`` (natural logarithm),
    where ``count`` is the value stored for the word in ``dic``.

    Args:
        dic: dict mapping a word to the number of documents containing it.
        doc_num: total number of documents.
        min_count: a word is kept only if its count is at least this value.
        min_idf: a word is kept only if its idf is at least this value.

    Returns:
        A new dict ``{word: idf}`` of the words passing both thresholds.
    """
    resultDic = dict()
    for key, count in dic.items():
        if count < min_count:
            # Too rare to be useful; skipping here also avoids the log.
            continue
        idf = math.log(doc_num / count)
        if idf >= min_idf:
            resultDic[key] = idf
    return resultDic

def textToVec(textList: list, wordToDic):
    """Turn a list of texts into an idf-filtered vocabulary and 0/1 vectors.

    Args:
        textList: the texts to vectorize, one entry per document.
        wordToDic: function mapping a text to a dict keyed by its words.

    Returns:
        A tuple ``(indexDic, matrix)``: ``indexDic`` maps each vocabulary
        word to its column index and ``matrix`` stacks one row per document.

    Raises:
        ValueError: if ``textList`` contains no documents.
    """
    someDic = dict()
    textDicList = list()
    for text in textList:
        textDic = wordToDic(text)
        # Keep the per-document dictionary; the original never stored it,
        # so the final stacking step always received an empty list.
        textDicList.append(textDic)
        someDic = mergeCountDic(someDic, textDic)
    if not textDicList:
        raise ValueError("textList must contain at least one document")
    indexDic = keyWithIndex(dicIdfFilter(someDic, len(textList)))
    return indexDic, np.vstack([dicToVec(indexDic, textDic) for textDic in textDicList])


index: 0 is in cluster 0
index: 1 is in cluster 2
index: 2 is in cluster 0
index: 3 is in cluster 1
index: 4 is in cluster 2
index: 5 is in cluster 0
index: 6 is in cluster 0
index: 7 is in cluster 0
index: 8 is in cluster 0
index: 9 is in cluster 0
index: 10 is in cluster 0
index: 11 is in cluster 0
index: 12 is in cluster 0
index: 13 is in cluster 0
index: 14 is in cluster 0
index: 15 is in cluster 0
index: 16 is in cluster 0
index: 17 is in cluster 2
index: 18 is in cluster 0
index: 19 is in cluster 0
index: 20 is in cluster 2
index: 21 is in cluster 2
index: 22 is in cluster 2
index: 23 is in cluster 2
index: 24 is in cluster 2
index: 25 is in cluster 2
index: 26 is in cluster 2
index: 27 is in cluster 2
index: 28 is in cluster 2
index: 29 is in cluster 2
index: 30 is in cluster 2
index: 31 is in cluster 2
index: 32 is in cluster 2
index: 33 is in cluster 2
index: 34 is in cluster 2
index: 35 is in cluster 2
index: 36 is in cluster 2
index: 37 is in cluster 2
index: 38 is in cluster 2
index: 39 is in cluster 2
index: 40 is in cluster 0
index: 41 is in cluster 1
index: 42 is in cluster 0
index: 43 is in cluster 0
index: 44 is in cluster 1
index: 45 is in cluster 0
index: 46 is in cluster 2
index: 47 is in cluster 1
index: 48 is in cluster 1
index: 49 is in cluster 0
index: 50 is in cluster 2
index: 51 is in cluster 0
index: 52 is in cluster 1
index: 53 is in cluster 2
index: 54 is in cluster 2
index: 55 is in cluster 0
index: 56 is in cluster 2
index: 57 is in cluster 0
index: 58 is in cluster 0
index: 59 is in cluster 0

0 ~ 19がエンタメで20 ~ 39が経済、40 ~ 59がスポーツニュースなのですが、経済のニュースは分類できていそうな結果になりました。