
[Python] jieba word segmentation: adding a custom dictionary, removing stopwords and single characters (Python, 2020.2.10)


The source code is as follows:

import jieba
import io
import re

#jieba.load_userdict("E:/xinxi2.txt")
patton=re.compile(r'..')   # findall() returns nothing for single-character tokens

# add the custom dictionary
def add_dict():
    f=open("E:/xinxi2.txt","r+",encoding="utf-8")   # dictionary scraped from Baidu
    for line in f:
        jieba.suggest_freq(line.rstrip("\n"), True)
    f.close()

# segment each sentence
def cut():
    number=0
    f=open("E:/luntan.txt","r+",encoding="utf-8")   # content to process: scraped CSDN forum titles
    for line in f:
        line=seg_sentence(line.rstrip("\n"))
        seg_list=jieba.cut(line)
        for i in seg_list:
            print(i)                    # print each token
            m=patton.findall(i)
            #print(len(m))              # print token length
            if len(m)!=0:               # keep only tokens of two or more characters
                write(i.strip()+" ")
        line=line.rstrip().lstrip()
        print(len(line))                # print sentence length
        if len(line)>1:
            write("\n")
            number+=1
    print("已处理",number,"行")          # "processed N lines"

# write the segmented output
def write(contents):
    f=open("E://luntan_cut2.txt","a+",encoding="utf-8")   # output file
    f.write(contents)
    #print("写入成功!")                  # "written successfully"
    f.close()

# build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# remove stopwords from a sentence
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('E://stop.txt')   # path of the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':            # skip tab characters
                outstr += word
                #outstr += " "
    return outstr

# remove stopwords over a whole file; unused helper
def cut_all():
    inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8')
    outputs = open('E://luntan_stop.txt', 'a')
    for line in inputs:
        line_seg = seg_sentence(line)   # the return value is a string
        outputs.write(line_seg + '\n')
    outputs.close()
    inputs.close()

if __name__=="__main__":
    add_dict()
    cut()
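The single-character filter hinges on patton = re.compile(r'..'): findall() returns an empty list for a one-character token, so only tokens of two or more characters get written out. A minimal standalone sketch of that check (the sample tokens below are made up for illustration):

import re

patton = re.compile(r'..')                    # matches any two consecutive characters

for token in ["论", "论坛", "Python", "的"]:   # made-up sample tokens
    m = patton.findall(token)
    if len(m) != 0:                           # single-character tokens give an empty list
        print(token)                          # prints: 论坛, Python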

The stopword list can be downloaded from Baidu, or you can create a .txt file yourself and add the words one per line, separated by line breaks.
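For a hand-made list, one word per line is all that is needed. A minimal sketch that writes a few placeholder stopwords (not the author's actual list) to the E://stop.txt path that seg_sentence() loads:

stopwords = ["的", "了", "是", "在"]           # placeholder entries, not the author's list
with open("E://stop.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(stopwords))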

