get word-sim by label

yongzhuo 2020-10-26 11:28:46 +08:00
parent b0abb0ffed
commit ceedc0a56a
4 changed files with 220 additions and 0 deletions


@@ -0,0 +1,5 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/10/26 11:06
# @author : Mo
# @function:

File diff suppressed because one or more lines are too long


@@ -0,0 +1,101 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/10/25 19:49
# @author : Mo
# @function:
# adapt sys.path for Linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(path_root)
print(path_root)
from utils.text_tools import txtRead, txtWrite, load_json, save_json
import gensim
import json
label_keywords0 = { "娱乐":["电影", "影视", "奥斯卡", "导演", "综艺", "动漫"],
"科技":["数码", "手机", "相机", "像素", "区块链", "人工智能", "数字化"],
"时尚":["时髦", "潮流", "穿搭", "性感", "奢侈品", "首饰"],
"时政":["外交", "政治", "实事", "草案", "中国梦", "小康"],
"家居":["家具", "建材", "厨卫", "涂料", "装修", "地砖", "炉壁"],
"房产":["房价", "房贷", "物业", "楼市", "二手房", "二套房"],
"游戏":["玩家", "网游", "手游", "技能", "王者荣耀", "出装"],
"体育":["比赛", "NBA", "体育讯", "得分", "足球", "竞赛"],
"财经":["基金", "投资", "股票", "分红", "理财", "保险"],
"教育":["考试", "学生", "英语", "四六级", "早教", "试卷"],
}
label_keywords1 = {
"教育":["教育", "语文", "体育教师","双创", "冰雪教育","老师","GIA","师范", "命题", "在线教育", "作文","早教",
"中职","张老师","学生","汉语言","试卷","支教团","人大附中","研学游","教师资格"],
"家居": ["欧派","科勒","樱雪","SUNIT世集","涂料","油烟机","电梯","灶具", "实地","板业", "风扇", "沃莱菲",
"花岗岩","岩板","玻璃胶","消毒柜","席梦思","水磨石", "清除率","号线", "床垫", "地板", "乳胶", "洗衣机", "红木","甲醛"],
"时尚": ["贝雷帽","麦肯齐", "连裤袜", "人台", "渔夫帽", "吊饰", "发饰", "白衬衫", "古驰", "派克", "切工"],
"时政": ["经开区", "", "科工", "抗旱", "作战", "立法", "战略", "用电量", "习仲勋", "党费", "巡视", "监审", "举报人", "行政"],
"科技": ["区块链", "佳能EOS", "暗网", "折叠屏", "ZooKeeper", "TCL", "数据管理", "PoS", "波场", "频谱", "机房", "PoW",
"一加", "公共电话", "互联网", "无人驾驶", "微信", "拼多多", "手机", "IaaS", "抖音", "HDMI", "可信", "人脸识别",
"PIN", "中兴", "个人信息", "小米", "B2B", "CTR", "平板", "应用程序", "通信协议", "挖矿",
"算力", "Wifi", "K8S", "分布式", "数据线"],
"房产": ["甲方", "乙方", "窗洞", "惠而浦", "燕郊", "LPR", "LPS", "天恒乐墅", "开发商", "恒大", "招商会", "买受人", "创客",
"住房", "购房者", "配租", "退房", "京兆", "公府", "城镇化"],
"财经": ["", "中港", "Q3","pct", "市净率", "ROIC", "大豆", "保险机构", "债权人", "GNP", "国资", "龙头股", "PTA", "理财产品", "LPG", "转增", "缩股",
"降息", "交割", "破发", "顺差", "研报", "停盘", "SPV", "央票", "生产总值", "操盘手", "瑞典克朗", "新加坡元", "SDR", "含税", "下调", "次级", "上涨",
"增速", "概念股", "除息", "除权", "薪资", "贸易顺差", "指标股", "非流通股", "贸易逆差"],
"游戏": ["王者", "首充", "小邪", "Altman", "XiXi", "3DO", "Ciwei", "Evc", "50pm", "德鲁依", "精魄", "晶灵", "COSer",
"雷克萨", "GANK", "小汐", "血露", "龙腾组", "指族", "战训队", "同乐会", "千人国战", "千人战"],
"体育": ["女排", "兵乓球", "跳水", "丁俊晖", "李元伟", "李彤", "萨沃", "张岐", "霍斯金", "奥多姆", "汪嵩", "广东队",
"快船队", "马连保", "UTSA", "钟诚", "曾文鼎", "小斯", "孙明明", "山东队", "八一队", "辽足", "国奥队",
"三连客","小牛队", "进球", "肘击", "沙帅", "赛风"],
"娱乐": ["峨影厂", "地戏", "墨攻", "花絮", "DMAX", "选角", "杀青", "拍戏", "配音", "绯闻", "离婚", "表白",
"蒋庆泉", "赵宁", "王世贞", "陈乾", "蔡荣名", "洪炉", "文玲姐", "温超", "白百何", "杨丽坤",
"林权泽", "王天冉", "严孝国", "蒋利", "傅东", "尚玟", "李蜜", "王雅萱", "滕华涛", "狄娜", "微博选角", "墨攻", "王小贱",
"唐一菲", "柳导", "隆裕太后"]
}
label_keywords = {"娱乐": ["电影", "影视", "奥斯卡", "导演"],
"科技": ["数码", "手机", "相机", "像素"],
"时尚": ["时髦", "潮流", "化妆", "性感"],
"时政": ["外交", "政治", "人大", "草案", "致辞", "审查", "督察组", "贯彻", "纪委", "劳动局"],
"家居": ["家具", "建材", "厨卫", "涂料"],
"房产": ["新房", "房贷", "物业", "楼市"],
"游戏": ["玩家", "网游", "手游", "页游"],
"体育": ["比赛", "欧冠", "排球", "得分"],
"财经": ["基金", "投资", "股票", "分红"],
"教育": ["考试", "学生", "数学", "高考"],
}
# candidate keywords: 穿搭, 房价, 体育讯
path_w2v = "sgns.wiki.word"
# path_w2v = "JDAI-Word-Embedding.txt"
# path_w2v = "Tencent_AILab_ChineseEmbedding.txt"
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(path_w2v, binary=False)  # limit=100000
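# note (not in the commit): for large embeddings such as Tencent_AILab_ChineseEmbedding.txt,
# loading can be capped with the commented-out limit argument, e.g.
#   gensim.models.KeyedVectors.load_word2vec_format(path_w2v, binary=False, limit=100000)
# which keeps only the first 100k vectors of the file.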
print("load ok!")
topn = 320
res = []
lkk = list(label_keywords.keys())
for label in lkk:
    key_words = label_keywords[label]
    key_words = [label] + key_words  # the label name itself is also queried
    for word in key_words:
        sim_word = None
        try:
            sim_word = w2v_model.most_similar(word, topn=topn)
        except Exception as e:
            # out-of-vocabulary words raise KeyError; log and skip them
            print(word)
            continue
        if sim_word:
            line_dict = {"type": label, "word": word, "topk": sim_word}
            line_str = json.dumps(line_dict, ensure_ascii=False) + "\n"
            res.append(line_str)
txtWrite(res, "ccks_news_2020_keyword_sim_sgns.json")
mam = 0
# nohup python keyword_sim.py > sim.log 2>&1 &
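The script above writes one JSON object per line, of the form {"type": label, "word": query, "topk": [[word, sim], ...]}. A minimal sketch of reading that file back into a label → expanded-keywords table (not part of this commit; the function name and the 0.5 threshold are illustrative choices):

import json

def load_label_sim(path="ccks_news_2020_keyword_sim_sgns.json", min_sim=0.5):
    """collect, per label, every neighbour whose cosine similarity passes min_sim"""
    label_words = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            d = json.loads(line)
            words = {w for w, s in d["topk"] if s >= min_sim}
            label_words.setdefault(d["type"], set()).update(words)
    return label_words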


@@ -0,0 +1,108 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/10/25 11:07
# @author : Mo
# @function: rule-word-freq, count the frequency of the words unique to each label
# adapt sys.path for Linux
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(path_root)
print(path_root)
# macadam
from utils.text_tools import jieba_cut, txtRead, txtWrite, load_json, save_json
from conf.path_config import stop_words_path
from collections import Counter, OrderedDict
from tqdm import tqdm
import jieba
import json
import copy
# the stop-word list defaults to the hanlp stop-word table
with open(stop_words_path, "r", encoding="utf-8") as f_stop:
    stop_words = [stop_word.strip() for stop_word in f_stop.readlines()]
# stop_words = ["\t"]
def is_total_number(text: str) -> bool:
    """
    judge whether the text consists only of digits (plus '.' and '%'), 即判断是不是全是数字
    Args:
        text: str, eg. "macadam, 碎石路"
    Returns:
        bool, True or False
    """
    for word in text:
        if word not in "0123456789.%":
            return False
    return True
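# illustrative examples (not in the commit):
#   is_total_number("3.14%") -> True
#   is_total_number("第3名") -> False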
def statistics_keyword_by_label(path, rate=1):
    """
    count word frequencies per label and extract the words unique to each label
    Args:
        path: str, eg. "train.json"
        rate: float, eg. 0.75
    Returns:
        None
    """
    datas = txtRead(path)
    lwd = {}
    for i in tqdm(range(len(datas)), desc="jieba cut and statistics: "):
        # read text and label from the json line, then tokenize
        d = datas[i]
        d_json = json.loads(d)
        text = d_json.get("x", {}).get("text")
        label = d_json.get("y")
        word_list = list(jieba.cut(text))
        # drop stop words, pure numbers, and single-character tokens
        word_list = [wl for wl in word_list if wl not in stop_words and not is_total_number(wl) and len(wl) >= 2]
        # word-frequency count within the label; Counter.update adds counts
        # (a plain dict.update would overwrite them)
        word_freq_dict = Counter(word_list)
        if label not in lwd:
            lwd[label] = word_freq_dict
        else:
            lwd[label].update(word_freq_dict)
    # sort by frequency and keep the top `rate` share of each label's words
    lwd_keys = list(lwd.keys())
    lwd_soft = [sorted(lwd[l].items(), key=lambda x: x[1], reverse=True) for l in lwd_keys]
    lwd_soft_rate = [s[:int(len(s) * rate)] for s in lwd_soft]
    label_word_dict = {lwd_keys[i]: OrderedDict(lwd_soft_rate[i]) for i in range(len(lwd_keys))}
    print("cut ok!")
    # collect the words unique to each label
    label_keys = set(list(label_word_dict.keys()))
    label_words = {}
    for key in label_keys:
        key_dict = set(list(label_word_dict[key].keys()))
        keys_other = copy.deepcopy(label_keys)
        keys_other.discard(key)
        # all words of the other labels
        kos = set()
        for ko in keys_other:
            ko_dict = set(list(label_word_dict[ko].keys()))
            kos = kos | ko_dict
        # keep only the words that appear in no other label
        key_public = kos & key_dict
        key_label = key_dict - key_public
        label_word_freq = {kl: label_word_dict[key][kl] for kl in key_label}
        label_words[key] = label_word_freq
    save_json(label_words, "label_keyword_unique.json")


if __name__ == '__main__':
    path = "ccks_news_2020.json"
    statistics_keyword_by_label(path, rate=1)
    mm = 0
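The saved label_keyword_unique.json maps each label to a {word: freq} dict of words that occur in that label only. A minimal sketch of using it as a rule-based classifier (illustrative, not part of this commit; assumes the file produced above exists):

import json
import jieba

with open("label_keyword_unique.json", "r", encoding="utf-8") as f:
    label_words = {label: set(freqs) for label, freqs in json.load(f).items()}

def classify(text):
    # score each label by how many of its unique keywords the text contains
    tokens = set(jieba.cut(text))
    scores = {label: len(tokens & words) for label, words in label_words.items()}
    return max(scores, key=scores.get)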