import jieba

# Segment each title with jieba (precise mode)
def get_jiebaword():
    jieba.load_userdict('word.txt')
    # with open('keyword.txt') as fr:
    with open('keyword-1.txt') as fr:
        lines = fr.readlines()
    jiebaword = []
    for line in lines:
        line = line.strip('\n')
        # Precise mode (the default)
        seg_list = jieba.cut(line, cut_all=False)
        word = "/".join(seg_list)
        jiebaword.append(word)
    return jiebaword
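For a quick sanity check of the segmentation, precise mode can be tried on a single string (the sample title below is made up; real input comes from keyword-1.txt, and the exact split depends on the loaded dictionary):

import jieba

# Made-up sample title, for illustration only
print("/".join(jieba.cut("今日头条新闻标题", cut_all=False)))
# e.g. 今日/头条/新闻/标题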
# Remove stop words
def clean_stopword(jiebaword, stopword):
    fw = open('clean-word.txt', 'a+', encoding='utf-8')
    for words in jiebaword:
        words = words.split('/')
        for word in words:
            if word not in stopword:
                fw.write(word + '\t')
        fw.write('\n')  # one document per line, so later steps can read line by line
    fw.close()
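clean_stopword receives a stopword collection, but the section never shows how it is built. Below is a minimal sketch assuming a one-word-per-line file; both the helper name get_stopword and the file name stopword.txt are assumptions, not from the original:

# Assumed helper: load stop words into a set for fast "word not in stopword" checks
def get_stopword():
    stopword = set()
    with open('stopword.txt', encoding='utf-8') as fr:
        for line in fr:
            stopword.add(line.strip())
    return stopword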
# Drop duplicate lines, keeping only the first occurrence
def de_duplication():
    outfile = open('clean-word-2.txt', 'a+', encoding='utf-8')
    with open('clean-word.txt') as fr:
        words = fr.readlines()
    result = set()
    for line in words:
        if line not in result:
            result.add(line)
            outfile.write(line)
    outfile.close()
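Every writer in this pipeline opens its output in 'a+' mode, so a second run appends to stale results. A small sketch that clears the intermediate files before a fresh run (the helper name reset_outputs is an assumption):

import os

# Assumed helper: delete intermediate files so the 'a+' writers start clean
def reset_outputs():
    for name in ('clean-word.txt', 'clean-word-2.txt', 'k-means.txt'):
        if os.path.exists(name):
            os.remove(name)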
import pandas as pd
from nltk.cluster import KMeansClusterer, cosine_distance

def get_cluster(tfidf_arr, k):
    # Split into k clusters, using cosine distance as the similarity measure
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance)
    kmeans.cluster(tfidf_arr)
    # Get the cluster assignment of each vector
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    fw = open('k-means.txt', 'a+', encoding='utf-8')
    for i, v in kinds.items():
        fw.write(str(i) + '\t' + str(v) + '\n')
    fw.close()
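get_cluster expects tfidf_arr, one TF-IDF row vector per document, but the section does not show how it is produced. Below is a sketch using scikit-learn's TfidfVectorizer (an assumption; the original may build the vectors differently), reading the de-duplicated file one document per line:

from sklearn.feature_extraction.text import TfidfVectorizer

# Assumed helper: build a dense TF-IDF matrix from clean-word-2.txt
def get_tfidf():
    with open('clean-word-2.txt', encoding='utf-8') as fr:
        docs = [line.strip() for line in fr if line.strip()]
    # Tokens are already separated by tabs, so split on whitespace only
    vectorizer = TfidfVectorizer(token_pattern=r'\S+')
    return vectorizer.fit_transform(docs).toarray()  # KMeansClusterer wants dense row vectors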
# Write each document into the output file of its cluster
def cluster_text():  # function name assumed; the original fragment has no header
    index_cluser = []
    with open('k-means.txt', 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    for line in lines:
        line = line.strip('\n')
        line = line.split('\t')
        index_cluser.append(line)
    # index_cluser[i][j] is row i, column j (document index, cluster id)
    with open('clean-word-2.txt', "r", encoding='utf-8') as fr:
        lines = fr.readlines()
    for index, line in enumerate(lines):
        for i in range(len(lines)):
            if str(index) == index_cluser[i][0]:
                fw = open('title' + index_cluser[i][1] + '.txt', 'a+', encoding='utf-8')
                fw.write(line)
                fw.close()
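Putting the steps together, one possible driver follows the files each step writes and reads; get_stopword, get_tfidf, reset_outputs, and cluster_text are the assumed names from the sketches above, and k=10 is an arbitrary choice:

if __name__ == '__main__':
    reset_outputs()                      # assumed helper: clear old 'a+' output
    jiebaword = get_jiebaword()          # segment keyword-1.txt
    stopword = get_stopword()            # assumed helper: load stopword.txt
    clean_stopword(jiebaword, stopword)  # writes clean-word.txt
    de_duplication()                     # writes clean-word-2.txt
    tfidf_arr = get_tfidf()              # assumed helper: TF-IDF matrix
    get_cluster(tfidf_arr, 10)           # writes k-means.txt; k=10 is arbitrary
    cluster_text()                       # writes one title<cluster>.txt per cluster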