python jieba分词如何去除停用词

如题所述

举报该文章

相关建议 2017-04-17

-*- coding: utf-8 -*-
import jieba
import jieba.analyse
import sys
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')

#使用其他编码读取停用词表
#stoplist = codecs.open('../../file/stopword.txt','r',encoding='utf8').readlines()
#stoplist = set(w.strip() for w in stoplist)
#停用词文件是utf8编码
stoplist = {}.fromkeys([ line.strip() for line in open("../../file/stopword.txt") ])

#经过分词得到的应该是unicode编码，先将其转成utf8编码

温馨提示：内容为网友见解，仅供参考

当前网址：https://22.t2y.org/zz/c2i2iihtctc2thfcsx6.html

其他看法

第1个回答 2019-09-06

import jieba

# 创建停用词list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# 对句子进行分词
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('./test/stopwords.txt')  # 这里加载停用词的路径
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr

inputs = open('./test/input.txt', 'r', encoding='utf-8')
outputs = open('./test/output.txt', 'w')
for line in inputs:
    line_seg = seg_sentence(line)  # 这里的返回值是字符串
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()

相似回答

大家正在搜