from jpype import startJVM, getDefaultJVMPath, JClass, shutdownJVM
from smr import SimpleMapReduce
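# NOTE: smr is assumed to be a local module providing the SimpleMapReduce
# helper in the style of the classic multiprocessing MapReduce example: it is
# constructed with a map function and a reduce function, and calling the
# instance with a list of inputs runs the map step in a process pool,
# partitions the resulting (key, value) pairs by key, and applies the reduce
# function to each (key, [values]) item.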
import string
import multiprocessing
import codecs
import os
import glob
import time
from collections import defaultdict
result = defaultdict(int)
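# Start the JVM with the HanLP jar and the C:\hanlp directory (typically
# holding hanlp.properties and the data files) on the classpath; these paths
# are machine-specific and should be adjusted.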
startJVM(getDefaultJVMPath(), r"-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp", "-Xms1g", "-Xmx1g")
HanLP = JClass('com.hankcs.hanlp.HanLP')
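# Chinese and ASCII punctuation; every character in this set is later
# replaced with a space before segmentation.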
chinese_punctuation = r"?,。、‘’“”~·!()./"
punctuation = chinese_punctuation + string.punctuation
def convert_jlist_to_plist(jlist):
    """Convert a Java List returned by HanLP into a Python list of strings."""
    ret = []
    if jlist is None:
        return ret
    for i in range(jlist.size()):
        ret.append(str(jlist.get(i)))
    return ret
def file_to_words(filename):
    """Read a file and return a sequence of (word, occurrences) values."""
    TR = str.maketrans(punctuation, ' ' * len(punctuation))
    print(multiprocessing.current_process().name, 'reading', filename)
    output = []
    with codecs.open(filename, encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split('\t')
            if len(line_list) != 4:
                continue
            # NOTE: the original field names were lost; the question text is
            # assumed here to be the second of the four tab-separated columns.
            _, question, _, _ = line_list
            # Count the full question once, then count each segmented word.
            output.append((question, 1))
            line = question.translate(TR)        # replace punctuation with spaces
            segment = HanLP.segment(line)        # Java List returned by HanLP
            words = convert_jlist_to_plist(segment)
            for word in words:
                output.append((word, 1))
    return output
def count_words(item):
    """Convert the partitioned data for a word to a
    tuple containing the word and the number of occurrences.
    """
    word, occurrences = item
    return (word, sum(occurrences))
if __name__ == '__main__':
    import operator
    start = time.time()
    input_files = glob.glob('E:\\**_data\\*.txt')
    mapper = SimpleMapReduce(file_to_words, count_words)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1), reverse=True)
    shutdownJVM()
    print('\nTOP 2000 WORDS BY FREQUENCY\n')
    top2000 = word_counts[:2000]
    longest = max(len(word) for word, count in top2000)
    for word, count in top2000:
        print('%-*s: %5s' % (longest + 1, word, count))
    end = time.time()
    print('Elapsed time: {}'.format(end - start))