python 按照jieba分词后,再根据字典的词频进行排序输出
dict.txt
迅雷不及掩耳盗铃之势 1 掩耳盗铃 2 铃儿响叮当 3 当仁不让 5 让世界充满爱 3 让世界充满爱 5 迅雷不及 0 迅雷 0 掩耳 0 盗铃 0
实现代码
# -*- coding: utf-8 -*-
# @ModuleName: test002
# @Function: cut a title with jieba, then output the words sorted by dictionary frequency
# @Author: darling
# @Time: 2022-05-05 20:01
import jieba
from loguru import logger

# FIX: the path must be a string literal (the original was a SyntaxError).
# A relative path is used to keep deployment simple.
jieba.set_dictionary('./dict.txt')
jieba.initialize()  # initialize the jieba library up front

# word -> frequency mapping loaded from dict.txt.
# FIX: defined at module level so read_headers() does not hit a NameError
# when this file is imported instead of run as a script.
header = {}


def read_headers(file_name):
    """Load ``word frequency`` lines from *file_name* into the global ``header``.

    Blank lines are skipped, and entries whose frequency is 0 are ignored.
    Duplicate words keep the last frequency seen. Returns the populated dict.
    """
    with open(file_name, 'r', encoding='UTF-8') as file:
        for line in file:
            if line == '\n':
                continue
            parts = line.split(' ')
            key = parts[0].strip()
            val = parts[1].strip()
            if val == '0':  # zero-frequency words carry no ranking information
                continue
            header[key] = int(val)
    return header


def split_words(title):
    """Cut *title* with jieba (full mode) and return its multi-character
    words sorted by dictionary frequency, highest first.

    Words shorter than two characters are dropped, duplicates are removed,
    and words absent from the dictionary are skipped. Returns the sorted
    list of words, or None if an exception was logged.
    """
    split_dic = {}
    words = []
    try:
        logger.info('输入的标题为:{}', title)
        lcuts = jieba.lcut(title, cut_all=True)
        # keep only words of two or more characters
        list_lcuts = [x for x in lcuts if len(x) >= 2]
        # remove duplicate words
        list_lcuts = list(set(list_lcuts))
        logger.info('lcuts可拆分为:{}', list_lcuts)
        for word in list_lcuts:
            # look up the dictionary frequency; frequencies are not unique
            val = header.get(word)
            # FIX: skip words missing from the dictionary — the original kept
            # a None value, which made the sort below raise a TypeError.
            if val is None:
                continue
            split_dic[word] = val
        # sort descending by frequency
        new_dic = sorted(split_dic.items(), key=lambda x: -x[1])
        print(new_dic)
        for splits in new_dic:
            words.append(splits[0])
        logger.info('词频级排序后:{}', words)
        # FIX: actually return the result (the original return was commented out)
        return words
    except Exception as e:
        logger.info('拆词异常:{}', e)


def sort1(arr):
    """Sort *arr* in place (ascending) and return it.

    FIX: the original returned ``arr.sort()``, which is always ``None``.
    """
    arr.sort()
    return arr


if __name__ == '__main__':  # FIX: '__main__' must be a quoted string literal
    read_headers('./dict.txt')
    split_words('迅雷不及掩耳盗铃儿响叮当仁不让世界充满爱之势,迅雷不及掩耳盗铃之势')
拓展
对字典,列表进行排序,升序/降序的方式