The source code is as follows:
import jieba
import io
import re

#jieba.load_userdict("E:/xinxi2.txt")
patton = re.compile(r'..')  # used to keep only tokens of at least two characters

# add the custom dictionary
def add_dict():
    f = open("E:/xinxi2.txt", "r+", encoding="utf-8")  # dictionary crawled from Baidu
    for line in f:
        jieba.suggest_freq(line.rstrip("\n"), True)
    f.close()

# segment the sentences
def cut():
    number = 0
    f = open("E:/luntan.txt", "r+", encoding="utf-8")  # content to process: crawled CSDN forum titles
    for line in f:
        line = seg_sentence(line.rstrip("\n"))
        seg_list = jieba.cut(line)
        for i in seg_list:
            print(i)  # print each token
            m = patton.findall(i)
            #print(len(m))  # print the character count
            if len(m) != 0:
                write(i.strip() + " ")
        line = line.rstrip().lstrip()
        print(len(line))  # print the sentence length
        if len(line) > 1:
            write("\n")
            number += 1
    print("Processed", number, "lines")

# write the tokens out after segmentation
def write(contents):
    f = open("E://luntan_cut2.txt", "a+", encoding="utf-8")  # output file
    f.write(contents)
    #print("Write succeeded!")
    f.close()

# build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# remove stopwords from a sentence
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('E://stop.txt')  # path of the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                #outstr += " "
    return outstr

# batch stopword removal over a whole file (not called in __main__)
def cut_all():
    inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8')
    outputs = open('E://luntan_stop.txt', 'a', encoding='utf-8')
    for line in inputs:
        line_seg = seg_sentence(line)  # the return value is a string
        outputs.write(line_seg + '\n')
    outputs.close()
    inputs.close()

if __name__ == "__main__":
    add_dict()
    cut()
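In add_dict(), jieba.suggest_freq is used (rather than the commented-out jieba.load_userdict) to raise the frequency of each crawled entry so that jieba keeps it as a single token. The snippet below is a minimal sketch of that effect; the sample phrase and the term "台中" are only illustrative and are not taken from the xinxi2.txt dictionary:

import jieba

# Illustrative phrase only; not from the author's data.
sample = "「台中」正确应该不会被切开"
print("/".join(jieba.cut(sample, HMM=False)))   # "台中" may be split into "台" / "中"

# add_dict() calls suggest_freq like this for every line of the dictionary file.
jieba.suggest_freq("台中", True)
print("/".join(jieba.cut(sample, HMM=False)))   # "台中" is now kept as one token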
The stopword list can be found and downloaded via Baidu, or you can create a .txt file yourself and add the words one per line (separated by newlines).
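As a minimal sketch of that setup, assuming the functions from the script above are in scope and using a few made-up stopwords, the file can be created and checked like this:

# Build a tiny stopword file by hand (one word per line); the entries are examples only.
with open("E://stop.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n和\n")

print(stopwordslist("E://stop.txt"))   # ['的', '了', '和']
print(seg_sentence("今天的天气真好"))   # the stopword "的" is dropped from the result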