Add files via upload

2019-06-13 23:18:46 +08:00 · 2019-06-13 23:18:46 +08:00 · dcc40f4e43
commit dcc40f4e43
parent 339685555f
36 changed files with 17401 additions and 0 deletions
--- a/keras_textclassification/init.py
+++ b/keras_textclassification/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/13 16:13
+# @author   :Mo
+# @function :
--- a/keras_textclassification/base/init.py
+++ b/keras_textclassification/base/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 11:24
+# @author   :Mo
+# @function :
--- a/keras_textclassification/base/embedding.py
+++ b/keras_textclassification/base/embedding.py
@ -0,0 +1,217 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 11:29
+# @author   :Mo
+# @function :embeddings of model, base embedding of random word2vec or bert
+
+
+import codecs
+import os
+
+import jieba
+import keras_bert
+from gensim.models import KeyedVectors
+from keras.engine import Layer
+from keras.layers import Concatenate
+from keras.layers import Embedding
+from keras.models import Input
+from keras.models import Model
+
+
+class NonMaskingLayer(Layer):
+    """
+    fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978
+    thanks for https://github.com/jacoxu
+    """
+
+    def __init__(self, **kwargs):
+        self.supports_masking = True
+        super(NonMaskingLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        pass
+
+    def compute_mask(self, input, input_mask=None):
+        # do not pass the mask to the next layers
+        return None
+
+    def call(self, x, mask=None):
+        return x
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+
+class BaseEmbedding:
+    def __init__(self, hyper_parameters):
+        self.corpus_path = hyper_parameters['embedding'].get('corpus_path', 'corpus_path') # 'dict' or 'corpus'
+        self.level_type = hyper_parameters['embedding'].get('level_type', 'char') # 还可以填'word'
+        self.vocab_size = hyper_parameters['embedding'].get('vocab_size', 30000) #
+        self.embed_size = hyper_parameters['embedding'].get('embed_size', 300) # word 150万所以300；中文char 2000所以30左右
+        self.len_max = hyper_parameters['embedding'].get('len_max', 50) # 建议25-50
+        self.ot_dict = { 'PAD': 0,
+                         'UNK': 1,
+                         'BOS': 2,
+                         'EOS': 3, }
+        self.deal_corpus()
+        self.build()
+
+    def deal_corpus(self):
+        pass
+
+    def build(self):
+        self.token2idx = {}
+        self.idx2token = {}
+
+    def sentence2idx(self, text):
+        text = str(text)
+        if self.level_type == 'char':
+            text = list(text.replace(' ', '').strip())
+        elif self.level_type == 'word':
+            text = list(jieba.cut(text, cut_all=False, HMM=True))
+        else:
+            raise RuntimeError("your input level_type is wrong, it must be 'word' or 'char'")
+        text = [text_one for text_one in text]
+        len_leave = self.len_max - len(text)
+        if len_leave >= 0:
+            text_index = [self.token2idx[text_char] if text_char in self.token2idx else self.token2idx['UNK'] for
+                          text_char in text] + [self.token2idx['PAD'] for i in range(len_leave)]
+        else:
+            text_index = [self.token2idx[text_char] if text_char in self.token2idx else self.token2idx['UNK'] for
+                          text_char in text[0:self.len_max]]
+        return text_index
+
+    def idx2sentence(self, idx):
+        assert type(idx) == list
+        text_idx = [self.idx2token[id] if id in self.idx2token else self.idx2token['UNK'] for id in idx]
+        return "".join(text_idx)
+
+
+class RandomEmbedding(BaseEmbedding):
+    def __init__(self, hyper_parameters):
+        super().__init__(hyper_parameters)
+        # self.path = hyper_parameters.get('corpus_path', path_embedding_random_char)
+
+    def deal_corpus(self):
+        token2idx = self.ot_dict.copy()
+        count = 3
+        if 'term_char' in self.corpus_path:
+            with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
+                while True:
+                    term_one = fd.readline()
+                    if not term_one:
+                        break
+                    term_one = term_one.strip()
+                    if term_one not in token2idx:
+                        count = count + 1
+                        token2idx[term_one] = count
+
+        elif 'corpus' in self.corpus_path:
+            with open(file=self.corpus_path, mode='r', encoding='utf-8') as fd:
+                terms = fd.readlines()
+                for term_one in terms:
+                    if self.level_type == 'char':
+                        text = list(term_one.replace(' ', '').strip())
+                    elif self.level_type == 'word':
+                        text = list(jieba.cut(term_one, cut_all=False, HMM=True))
+                    else:
+                        raise RuntimeError("your input level_type is wrong, it must be 'word' or 'char'")
+                    for text_one in text:
+                        if term_one not in token2idx:
+                            count = count + 1
+                            token2idx[text_one] = count
+        else:
+            raise RuntimeError("your input level_type is wrong, it must be 'dict' or 'corpus'")
+        self.token2idx = token2idx
+        self.idx2token = {}
+        for key, value in self.token2idx.items():
+            self.idx2token[value] = key
+
+    def build(self, **kwargs):
+        self.vocab_size = len(self.token2idx)
+        self.input = Input(shape=(self.len_max, ), dtype='int32')
+        self.output = Embedding(self.vocab_size,
+                            self.embed_size,
+                            input_length=self.len_max,
+                            trainable=True)(self.input)
+        self.model = Model(self.input, self.output)
+
+
+class WordEmbedding(BaseEmbedding):
+    def __init__(self, hyper_parameters):
+        super().__init__(hyper_parameters)
+        # self.path = hyper_parameters.get('corpus_path', path_embedding_vector_word2vec)
+
+    def build(self, **kwargs):
+        self.embedding_type = 'word2vec'
+        print("load word2vec start!")
+        self.key_vector = KeyedVectors.load_word2vec_format(self.corpus_path, **kwargs)
+        print("load word2vec end!")
+        self.embed_size = self.key_vector.vector_size
+
+        self.token2idx = self.ot_dict.copy()
+        embedding_matrix = []
+        for word in self.key_vector.index2entity:
+            self.token2idx[word] = len(self.token2idx)
+            embedding_matrix.append(self.key_vector[word])
+        self.token2idx = self.token2idx
+        self.idx2token = {}
+        for key, value in self.token2idx.items():
+            self.idx2token[value] = key
+
+        len_token2idx = len(self.token2idx)
+
+        input_layer = Input(shape=(self.len_max,), dtype='int32')
+
+        output = Embedding(len_token2idx,
+                            self.embed_size,
+                            input_length=self.len_max,
+                            weights=[embedding_matrix],
+                            trainable=False)(input_layer)
+        self.model = Model(input_layer, output)
+
+
+class BertEmbedding(BaseEmbedding):
+    def __init__(self, hyper_parameters):
+        super().__init__(hyper_parameters)
+        # self.path = hyper_parameters.get('corpus_path', path_embedding_bert)
+
+    def build(self):
+        self.embedding_type = 'bert'
+        config_path = os.path.join(self.corpus_path, 'bert_config.json')
+        check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
+        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
+        model = keras_bert.load_trained_model_from_checkpoint(config_path,
+                                                              check_point_path,
+                                                              seq_len=self.len_max)
+        num_layers = len(model.layers)
+        features_layers = [model.get_layer(index=num_layers-1+idx*8).output\
+                            for idx in range(-3, 1)]
+        embedding_layer = Concatenate(features_layers)
+        output_layer = NonMaskingLayer()(embedding_layer)
+        self.model = Model(model.inputs, output_layer)
+
+        self.embedding_size = self.model.output_shape[-1]
+        word2idx = {}
+        with open(dict_path, 'r', encoding='utf-8') as f:
+            words = f.read().splitlines()
+        for idx, word in enumerate(words):
+            word2idx[word] = idx
+        for key, value in self.ot_dict.items():
+            word2idx[key] = word2idx[value]
+
+        self.token2idx = word2idx
+
+        # reader tokenizer
+        self.token_dict = {}
+        with codecs.open(dict_path, 'r', 'utf8') as reader:
+            for line in reader:
+                token = line.strip()
+                self.token_dict[token] = len(self.token_dict)
+
+        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
+
+    def sentence2idx(self, text):
+        input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.len_max)
+        input_mask = [0 if ids == 0 else 1 for ids in input_id]
+        return input_id, input_type_id, input_mask
--- a/keras_textclassification/base/graph.py
+++ b/keras_textclassification/base/graph.py
@ -0,0 +1,128 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :graph of base
+
+
+from keras.callbacks import ModelCheckpoint, EarlyStopping
+from keras.optimizers import Adam
+from keras import backend as K
+
+import tensorflow as tf
+import numpy as np
+
+# keras / tensorflow: control GPU usage for this process.
+# A session capped at a 0.46 per-process GPU memory fraction is created at
+# import time and installed as the Keras backend session.
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.46
+sess = tf.Session(config=config)
+K.set_session(sess)
+
+class graph:
+    def __init__(self, hyper_parameters):
+        """
+            模型初始化
+        :param hyper_parameters:json, json['model'] and json['embedding']  
+        """
+        hyper_parameters_model = hyper_parameters['model']
+        self.label = hyper_parameters_model.get('label', 2)  # 类型
+        self.batch_size = hyper_parameters_model.get('batch_size', 32)  # 批向量
+        self.embed_size = hyper_parameters_model.get('embed_size', 300)  # 嵌入层尺寸
+        self.filters = hyper_parameters_model.get('filters', [3, 4, 5])  # 卷积核大小
+        self.kernel_size = hyper_parameters_model.get('kernel_size', 300)  # 核长
+        self.channel_size = hyper_parameters_model.get('channel_size', 1)  # 通道数
+        self.dropout = hyper_parameters_model.get('dropout', 0.5)          # dropout层系数，舍弃
+        self.decay_step = hyper_parameters_model.get('decay_step', 100)    # 衰减步数
+        self.decay_rate = hyper_parameters_model.get('decay_rate', 0.9)    # 衰减系数
+        self.epochs = hyper_parameters_model.get('epochs', 20)             # 训练轮次
+        self.len_max = hyper_parameters_model.get('len_max', 50)           # 文本最大长度
+        self.vocab_size = hyper_parameters_model.get('vocab_size', 20000)  # 字典词典大小
+        self.lr = hyper_parameters_model.get('lr', 1e-3)                   # 学习率
+        self.l2 = hyper_parameters_model.get('l2', 1e-6)                   # l2正则化系数
+        self.activate_classify = hyper_parameters_model.get('activate_classify', 'softmax')  # 分类激活函数,softmax或者signod
+        self.embedding_type = hyper_parameters_model.get('embedding_type', 'word2vec')  #词嵌入方式，可以选择'bert'、'gpt-2'、'word2vec'或者'None'
+        self.is_training = hyper_parameters_model.get('is_training', False)  # 是否训练
+        self.model_path = hyper_parameters_model.get('model_path', "model")  # 模型地址
+        self.create_model(hyper_parameters)
+        if self.is_training:
+            self.create_compile()
+
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters: json，超参数
+        :return:  
+        """
+        # embeddings选择
+        Embeddings = None
+        if self.embedding_type == 'random':
+            from keras_textclassification.base.embedding import RandomEmbedding as Embeddings
+        elif self.embedding_type == 'char':
+            from keras_textclassification.base.embedding import CharEmbedding3 as Embeddings
+        elif self.embedding_type == 'bert':
+            from keras_textclassification.base.embedding import BertEmbedding as Embeddings
+        elif self.embedding_type == 'word2vec':
+            from keras_textclassification.base.embedding import WordEmbedding as Embeddings
+        else:
+            raise RuntimeError("your input embedding_type is wrong, it must be 'random'、 'bert' or 'word2vec")
+        # 构建网络层
+        self.word_embedding = Embeddings(hyper_parameters=hyper_parameters)
+        self.model = None
+
+    def callback(self):
+        """
+          评价函数、早停
+        :return: 
+        """
+        cb_em = [ EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=3),
+                  ModelCheckpoint(monitor='val_loss', mode='min', filepath=self.model_path, verbose=1,
+                                  save_best_only=True, save_weights_only=False),]
+        return cb_em
+
+    def create_compile(self):
+        """
+          构建优化器、损失函数和评价函数
+        :return: 
+        """
+        self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
+                           loss='categorical_crossentropy',
+                           metrics=['accuracy'])
+
+    def fit(self, x_train, y_train, x_dev, y_dev):
+        """
+            训练
+        :param x_train: 
+        :param y_train: 
+        :param x_dev: 
+        :param y_dev: 
+        :return: 
+        """
+        self.model.fit(x_train, y_train, batch_size=self.batch_size,
+                       epochs=self.epochs, validation_data=(x_dev, y_dev),
+                       shuffle=True,
+                       callbacks=self.callback())
+
+    def load_model(self):
+        """
+          模型下载
+        :return: 
+        """
+        print("load_model start!")
+        self.model.load_weights(self.model_path)
+        print("load_model end!")
+
+    def predict(self, sen):
+        """
+          预测
+        :param sen: 
+        :return: 
+        """
+        if type(sen)==np.ndarray:
+            sen = sen
+        elif type(sen)==list:
+            sen = np.array([sen])
+        else:
+            raise RuntimeError("your input sen is wrong, it must be type of list or np.array")
+        return self.model.predict(sen)
+
--- a/keras_textclassification/conf/init.py
+++ b/keras_textclassification/conf/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/5 21:04
+# @author   :Mo
+# @function :
--- a/keras_textclassification/conf/path_config.py
+++ b/keras_textclassification/conf/path_config.py
@ -0,0 +1,23 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/5 21:04
+# @author   :Mo
+# @function :file of path
+
+import os
+
+# 项目的根目录
+path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+# path of embedding
+path_embedding_random_char = path_root + '/data/embeddings/term_char.txt'
+path_embedding_bert = path_root + '/data/enbeddings/bert/'
+path_embedding_vector_word2vec = path_root + '/data/embeddings/'
+
+# classify data of baidu qa 2019
+path_baidu_qa_2019_train = path_root + '/data/baidu_qa_2019/baike_qa_train.csv'
+path_baidu_qa_2019_valid = path_root + '/data/baidu_qa_2019/baike_qa_valid.csv'
+
+# fast_text config
+path_fast_text_model = path_root + '/data/model/fast_text/'
+path_model_fast_text_baiduqa_2019 = path_root + '/data/model/fast_text/model_fast_text.f5'
--- a/keras_textclassification/data/init.py
+++ b/keras_textclassification/data/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:50
+# @author   :Mo
+# @function :
--- a/keras_textclassification/data/baidu_qa_2019/baike_qa_train.csv
+++ b/keras_textclassification/data/baidu_qa_2019/baike_qa_train.csv
@ -0,0 +1,100 @@
+label,ques
+教育,人站 在 地球 上 为什么 没有 头朝下 的 感觉
+娱乐,我 的 小 baby
+娱乐,请问 这起 交通事故 是 谁 的 责任 居多 小车 和 摩托车 发生 事故 在 无 红绿灯
+电脑,松本 面板 可以 配 什么 品牌 的 超五类 模块
+生活,请教 怎么 能 很快 能 洗 干净 猪肠 有 什么 方法
+健康,毛孔 粗大 怎么办 我 脸上 长 豆豆 出 油 额头 鼻子 上 脸上 都 有 毛孔 很
+健康,黑河市 治疗 癫痫 最 权威 的 医院 有 哪些
+娱乐,替人 垂泪 到 天明  商业 用语 一
+娱乐,本期 铁胆 罗马 强势 单 3
+健康,降压药 的 选择 不 知道 波依定 和 洛活 喜 两种 药 各有 什么 特点 适合 哪种 人服
+游戏,有 6 区 无尽 之海众 神之子 工会 的 吗 你们 会 怎么 了 看不见 人 了 解散 了
+商业,老师 好 青 青稞酒 明天 再 跌 就 可以 建仓 了 吗 谢谢
+教育,onthesouthofthecity 是 在 城内 还是 成外
+教育,古代 官职 升迁 又 怎么 说 的 比如 拜  最好 还有 例句
+教育,在 离心管 中 加入 水 和 密度 小于 水 的 小球 在 倾斜 的 离心管 中 加入 水 和 密度 小
+生活,邯郸市 哪里 有 卖纳威 沙发
+社会,请 各位 老师 讲解 如图所示  A B 是 圆 的 直径 的 两端 小张 在 A 点
+电脑,Word 文档 一页 半 文字 怎样 调整 使 其 真 好 布满 两页
+娱乐,出 大事 了 出 大事 了 你 知道 吗 武汉 江夏 公安分局 接到 群众 举报电话 说 在 天
+教育,简单 的 地理 基础知识 填空 要 100 正确 一 考点 知识 清单 一 地图 1
+健康,下体 很痒 怎么回事 是 什么 原因 呢 长 了 一些 像 疹子 一样 的 东西 正好 我
+电子,为什么 手机 连不上 wifi 我 的 手机 型号 是 lephone1800
+健康,请问 吃 紧急 避孕药 月经 推迟 多久 才 是 正常 的 我 因为 吃 了 那药 月经 已经
+健康,霉菌性 阴道炎 会上 行 感染 胎儿 吗 目前 未确定 怀孕 但 怀孕 可能性 较大 我 1
+教育,公鸡 和 母鸡 是 怎么 交配 的 如何 受精 产生 小鸡
+电脑,word 中想 只 插入 一个 键 箭头 如何 居中 我 在 word 中 我 想 打 一个
+健康,怎样 快点 长高 我 160 想长 高 5 厘米 怎么 可以 长高
+娱乐,根据 我 的 中文名 来 取个 英文名 苏金凤 根据 我 的 中文名 来 取个 英文名 本人 苏
+生活,16 个 月 男 宝宝 长 8 颗 牙齿  我家 宝宝 16 个 月 1 才 长 8 颗 牙齿
+娱乐,求解 这是 什么 东西 啊
+游戏,DZ 怎么 才能 在 大 战场 立脚 我 的 盗贼 无 BUFF3700 血 为了 拿 2 把
+育儿,宝宝 腹泻 一周 吃 了 药 不 见效 用换 腹泻 奶粉 吗 已有 宝宝 年龄 1 岁 2
+教育,一元 一次方程 车间 有 28 名 工人 生产 一种 螺栓 和 螺帽 一个 螺栓 的 两
+娱乐,电影票房 是 怎样 计算 的
+教育,将 SO2 气体 分别 通入 什么 溶液 中 能 出现 下述 现象  紫色 变 红色  紫
+社会,请问 什么 工作 或 职业 最 赚钱
+健康,重庆 哪里 可以 治疗 尖锐湿疣 能 不 复发
+教育,西方 经济学 论文 代发 怎么样 各位 都 有 这方面 的 经验 吗 都 尽量 说 说 吧
+游戏,上古 运动会 有人 作弊 吗 我 知道 上古 运动会 的 时间 是 下午 的 3 点 5 点 和 7 点
+汽车,大纳智捷 生活馆 是 怎样 看车 的
+娱乐,期任 九 108 元 初单 欢迎 指教  1 年轻人 vs 利物浦 2 乌迪内 vs 安
+游戏,真有 这么 巧 的 事 建立 个小弓 和 一 转前 的 武器 前 4 手上 了 3 个 G10 老婆
+电脑,怎么 安装 双系统 知道 的 进 高分 我 现在 用 的 是 XP 系统 但是 我 还 想安
+烦恼,九年 级 相似 图形 在 梯形 ABCD 中 AD  BC E 是 AB 上 的 一点
+商业,华夏 公司 网上交易 使用 的 广发卡 是 广发 理财 通卡 吗 如 题 广发 理财 通卡
+文化,历史 上 西施 活 了 多少岁
+社会,我 儿子 从 高一上 学期 与 班 中 的 一位 女 体育 特长生 关系密切 后 发现 他 对学
+游戏,请问 11 服 的 钢铁 原石值 多少 钱 呀
+生活,宝宝 这是 鼻炎 吗 宝宝 鼻塞 半个 月 刚 开始 的 鼻涕 是 白色 带 黄 后来 是 黄
+生活,用 了 祛痘 洗面奶 反而 脸上 长痘痘 了 怎么回事 啊
+文化,项链 的 作者 是 谁
+生活,不 睡 枕头 对 颈椎病 好 吗 他们 说 有 颈椎病 的 人 最好 不 睡 枕头
+烦恼,怎样才能 使 自己 的 内心 强大 起来
+烦恼,他 这样 是 爱我吗 我们 是 经人介绍 认识 认识 到 现在 已经 有 三个 月 了 自
+教育,竹炭 煮水 的 好处 在 水壶 里 放 两大片 洗净 的 竹 碳 坐水时 一起 煮 这样 的 开
+健康,上海 哪家 医院 治疗 输卵管 堵塞 好
+生活,冬天 进补 好 呢 还是 夏天 好
+生活,夏季 如何 进行 饮食 调养 养生
+教育,请问 钨 钼 钨 铝丝 的 居里点 是 多少
+游戏,我 写 了 我 的 姓名 和 身份证 号码 通过 哪个 未成年 许可 怎么 像是 没 反应 的 啊
+健康,为什么 会得 癔症
+健康,形成 胆结石 的 原因 是
+生活,外贸服装 哪里 进货 像 第一 街  署光里 那些 外贸服装 都 从 哪里 进货
+医疗,辐射 防护 该 怎么 做 呢
+烦恼,我 好 心疼 哦  我 听说 喜欢 的 人 已经 有 心上人 了 我 好 心疼 哦 你们 可
+商业,我 就 贷款 买 基金 了 今年 贷款 买 了 好多 的 基金 啊 收益 25 还 多 从 八月 到
+教育,鬣狗 和 野狗 的 外观 和 毛色 都 差不多 为何 名称 不同
+商业,同花顺 2007 标准版 和 同花顺 盛大 密宝 有 什么 区别 同花顺 里 有 同花顺 2
+社会,T
+电脑,主板 启动 电脑 想 问 一下 在 主机 开关 出现 故障 时 直接 利用 主板 如何 启动
+生活,的 鼻子 很大 而且 没有 鼻梁 难看 死 了 有没有 办法 变小 或者 鼻梁 变挺
+健康,老是 感到 胸闷 是 怎么回事 我 40 岁 老是 感到 胸闷 喘 不过 气来 . 有
+生活,今天 我 MM 被 诊断 为 肺炎 虽然 她 确实 高烧 可是 并 没有 咳嗽 症状 这个
+游戏,电信 1 黄金 车身 1 型 的 价位 在 多少
+电脑,通过 qq 如何 控制 对方 电脑
+商业,这 两天 嘉实 300 和 南方 稳健 都 要 大 比例 分红 了 . 分完 净值 都 在 1 左右 了
+教育,穷 在 句 中 的 意思 穷追猛打 中穷 是 极端 的 意思 还是 彻底 的 意思
+电脑,电子词典 选 什么 牌子 的 五一 想 入手 一款 电子词典 该选 什么 牌子 的 要
+游戏,谁 帮 我 捉 1 只 火暴 萝卜 和 1 个 火爆 樱桃 是 火暴 的
+健康,女人体 香是 怎么回事
+商业,有关 专利发明 奖金 是否 要 缴纳 个人所得税 哪位 大仙 知道 公司 对 职务 发明
+文化,请问 这句 莫名其妙 的 诗句 是 什么 意思 完整 的 如下 太窥门 夹豆 丫洗 盆
+,微博 是不是 可以 有个 类似 于 qq 空间 的 功能 谁 来 微博 主页 了 就 发个 消息
+游戏,55 级 的 菩提 宝杖 从 哪 打 出来 的
+游戏,都 是 新手 问题 帮助 新手 的 非常简单 新手 请 回答 问题 一 说出 一个 怪
+社会,什么 是 社会主义 社会主义 的 本质 是 什么
+健康,重庆 大坪 医院 打胎 大概 多少 钱
+游戏,魔 转物 那个 技能 几级 可以 学 哪里 学 啊
+游戏,全力 火弓 有 前途 吗  我 是 15 级 全力 火弓  请问 各位 大虾 有 前途 吗
+游戏,地下 钟乳洞 地宫 怎么 去 要 在 哪里 接 任务 还要 打 什么 东西 吗 哪里
+游戏,3 个 战场 的 双倍 声望 时间 请 说 的 详细 些
+生活,从 上海 森勤 国际 大酒店 到 上海 维也纳 国际 大酒店 有多远
+游戏,淡谈 真三武名 系统 取消 后 的 波澜 关于 这次 武名 系统 取消 同意 的 占 大多数
+生活,为什么 上海 地铁 单单 只有 上海 马戏 城站 的 站台 上 有门 同 标题
+商业,关于 基金 啥 叫 前端 申购 代码 后 端 申购 代码 . 有 啥 区别 . 有 甚么 作用 .
+娱乐,小 联赛 足彩 延期 开奖 推迟 比赛 或 310 全算 对 新浪 体育讯 北京 时间 12
+游戏,进入 易玩通 网站 怎么 看不见 我 还有 多少 点 呀 还是 0 点 去 哪能 看到
+生活,北京 哪里 礼物 便宜 毛绒玩具 哦 质量 也 要 好
+娱乐,大家 都 买 了 多少 钱
--- a/keras_textclassification/data/baidu_qa_2019/baike_qa_valid.csv
+++ b/keras_textclassification/data/baidu_qa_2019/baike_qa_valid.csv
@ -0,0 +1,100 @@
+label,ques
+烦恼,请问 深入骨髓 地 喜欢 一个 人 怎么办 我 不能 确定 对方 是不是 喜欢 我 我 却 想
+游戏,我 登陆 诛仙 2 时 总 说 我 账号密码 错误 但是 我 打 的 是 正确 的 就算 不 对 我
+游戏,斩 魔仙 者 称号 怎么 得来 的
+商业,有 哪位 好心人 上传 一份 女 衬衫 的 加拿大 海关 发票 给 我 看 一下 塞 多谢 了
+娱乐,想 去 澳州 看 亲戚 二个 星期 怎么 去 求教
+生活,送 男生 什么 生日礼物 好 呢  思考
+教育,英语 音节 划分 问题 在 重读 和 非重 读音节 的 相邻 处 只有 一个 辅 字组 时 如果
+教育,厂房 内有 吊车 起吊 高度 怎么 定 厂房 内有 吊车 牛腿 高度 怎么 定 根据
+育儿,你好 请问 有 疼痛 晕厥 史 的 产妇 可以 顺产 吗
+商业,投资 个人 理财产品 需注意 哪些 问题
+健康,女性 经常 有作 乳房 自查 是否 不用 每年 上 医院 作 体检 了 经常 自查 乳房 的
+文化,我爱你 古文 怎么 说
+教育,为什么 没有 副 总书记 一职
+教育,已知 a b 都 是 锐角 tana 4 3 tanb 1 7 求 a
+娱乐,请教 一下 这是 什么 植物 谢谢
+游戏,家族 建立 必须 等级 我 看 新浪 任务 说 家族 建立 的 必须 条件 是 LV50
+健康,牙龈 包住 牙齿 怎么办
+教育,加强 调 的 着重号 有 什么 作用 列宁 指出  物质 是 标志 客观实在 的 哲
+电脑,请问 这样 一个 配置 需要 多大 的 电源 cpu AMD 羿龙 X38450
+教育,翻译 3Shedescribestheexperienceof15
+娱乐,网络 上 真的 没有 帅哥美女 吗 突然 间 想到 的  以前 在 哪里 看到 过 是 这么
+生活,耳朵 怎么 了 宝宝 晚上 睡觉 总是 翻来覆去 的 仿佛 睡 不 塌实 而且 每次 翻身
+生活,北京 房产证 过户 手续 怎么办 夫妻 两人 一人先 走 房产 过户 另 一个 人 手
+健康,面烧糊 了 还 能 吃 吗 糊味 明显 吃 了 对 健康 影响 大 吗
+社会,刚 毕业 的 大学生 怎样 考 职称 吗 求教 刚 毕业 的 大学生 可以 怎样 考 职称
+娱乐,求 好看 的 动漫
+游戏,为什么 我起 名字 老是 不 合法 这个 游戏 不 可以 起 带有 特殊符号 的 名字 吗
+健康,你 相信 这个 世上 有 真 爱 吗 我 觉得 人 在 一起 久 了 就 会 有 感情 你们 说 呢
+教育,我家 公猫 为什么 晚上 乱叫 呀
+健康,你好 我家 宝宝 7 多月 这 几天 拉肚子 去 医院 检查 说 是 细菌 感染 肠炎 吃
+游戏,关于 安装 台服 的 问题 等 CWOW 遥遥无期 鄙人 也 花 了 一天 时间 下载 了 台
+娱乐,李克勤 什么 歌 好听
+健康,血压 140 105 我 18 岁 征兵 体检 时 发现 有 高血压 30 岁 时 偶尔 会
+生活,有人 知道 宝宝 补锌 产品 排行榜 么 知道 的 可以 说 说 建议 上面 什么 产品 比
+生活,春装 秋装 区别 我 想 知道 春装 和 秋装 的 区别 要 详细 地说
+生活,全国 的 邮政 小包 价格 一样 吗
+教育,我 在 山西 冬天 榕树 怎样 养植 榕树 一直 掉 叶 现在 没剩 几片 了
+教育,怎样 才 能够 进入 外交部 工作 呢 有 什么 要求 吗 必须 是 相关 的 的 专业 才 可
+商业,广发 货币基金 转聚丰 需要 几天
+生活,上海 简单 有效 的 祛斑 方法 祛斑 需要 多少 钱
+游戏,怀旧 我 想 打 阿鲁巴 斯 要 拿 护士 帽 吗 怎么 个 流程 话 说 我 玩 这么久 没直
+电子,为什么 低端 的 数码相机 采用 CMOS 传感器 高级 一点 采用 CCD 而 更
+游戏,攻击 别国 占领 陆地 的 结果 是 如何 计算 的 攻击 别国 占领 陆地 的 结果 是 如何
+电脑,XP 关机时 出现 结束 SAMPLE 程序 对话框 怎么回事 我 的 Wi
+游戏,天府 情缘 出 问题 了 晚上 天府 都 进不去 了 给 个 说法 哦 我 还 在 杀怪 哈死 了 掉
+娱乐,任九 让 我 优 让 我 喜   一次次 买彩 一次次 倾听 那 比分 一次
+电脑,汉城 什么 时候 改名 为首 尔 为什么
+电脑,UPS 电源 工作 原理
+电脑,我 在 上 注册 在 中文 香港 界面 就 可以 用 用户名 登录 成功 在 英文 界面 就 说
+烦恼,11 月 17  q 毖  稚齷了 U
+商业,站 在 2995 以上 的 高岗 我 全副武装 满仓  站 在 2900 以上 的 高岗
+,急需
+游戏,关于 BL 我们 队伍 平均 70 级 2 魔 1 传 1 弓 1 格 2 次 BL 都 过 不了
+文化,歇后语 打破 脑壳 不 叫 痛
+娱乐,大家 发现 了 一个 问题 了 吗  关于 中奖  江西 的 彩民 每期 的 投注额 绝
+游戏,使命 召唤 2 的 问题 帮帮我 谢谢 了 从 新浪 下载 的 使命 召唤 2 使用 了 XEZ
+游戏,回归 魔力 . 问 练功 点 . 回归 魔力 电信 时 长 巨蟹 的 . 请问 112 的 敏 魔 要
+社会,有限公司 可以 不设 股东会 把 董事会 作为 权力 机构 吗 如 题
+教育,2009 年 湖南 会计人员 继续 教育 是 什么 时候
+育儿,我 的 奶水 不够 吃 怎么 增加 奶水
+教育,英语 What sup  是 什么 意思
+烦恼,jiaoyu 如何 培养 学生 阅读 能力
+电脑,CAD 软件 的 角度 取值 为 整数 1 度 2 度 3 度 4 度 没有 2.1 度
+社会,物流 公司 发货 不 按照 协议 发货 照成 货物 损坏 怎么 赔偿
+社会,会 开车 的 好处 我 是 个 大学生 请问 有 驾驶证 对 以后 找 工作 帮助 大 吗
+文化,何谓 普普 来历
+游戏,问道 游戏 有 什么 办法 可以 快速 刷 友好度 的
+生活,市区 哪里 还有 卖 北京 冰糖葫芦 的 抓狂  抓狂  抓狂  抓狂
+生活,请问 各位 大虾 北京 哪里 有 卖 橄榄油 的 我 想 买 不知 哪里 有 卖 的 那位
+游戏,50 级 最好 的 双手 武器 想 在 50 级 的 战场 呆 一段时间 所以 求个 50
+商业,1 . 大宗 交易 的 金额 与 数量 有何 规定  一 A股 交易 数量 在 50 万股
+生活,谁 用 过 水宜 生杯  真的 是 那么 有 作用 吗 我 看电视 上 的 宣传 广告 觉
+商业,退税 营业税 城建税 教育费 附加 所得税 分别 怎么 做 帐
+游戏,我 这样 的 配置 会 不会 卡 游戏 登 不 上去 没试 过 不 知道 XP2500 O
+生活,深圳 至 赣州 怎样 走 请问 深圳 开车 到 赣州 应该 怎样 走 啊 请 各位 可以 详细 告
+健康,遵义市 治疗 女性 癫痫 需要 注意 患者 的 哪些
+健康,工作 太累会 造成 便秘 吗 整天 干重 体力劳动 大便 老是 不通 排不净
+游戏,牛 C 进来 回答 下 上次 在 4 区 看到 个 会后 中投 后 3 分 的 C 不 知道 怎么 会 的
+商业,帮 分析 600527 江南 高纤 今天 是否 为 上涨 前 的 蓄势 调整 吗
+体育,郝海东 现在 干什么 了 怎么 也 不 听 报道 了
+健康,怀孕 什么 时候 能 感觉 到 初期 症状 都 有 什么 怀孕 什么 时候 能 感觉 到 初
+生活,吃 什么 生精 比较 快 吃 什么 生精快 而且 易 食用
+商业,清泉 点评 之 39 角度 与 变线 以前 是 准备 放在 80 讲 以后 进行 的 也 就
+游戏,学 手机游戏 开发 有 前途 吗 悦成 学校 好 么 现在 网络 上 关于 就业 关于 手
+生活,有 谁 知道 妆点 一生 的 洗发水 去屑 效果 怎么样
+电子,诺基亚 E63 最低 电量 指示 空格 还是 一 格 是不是 没 电 无法 开机 的 时候 摁 开
+教育,山东 最好 的 厨师 学校
+游戏,末日 轻装 套 3 章会涨 还是 会降 幅度 大不大 懂 市场 的 哥哥 姐姐 能 说 说 吗
+娱乐,256 元冲 14 场 火锅 而 去 01 切尔西 VS 维 拉 3102 西汉姆 VS 雷
+游戏,用 STEAM 打 1.6 进有 CD 的 服务器 进去 后 一会 就 不能 动 了 请问 怎
+生活,怎么 把 脸 洗 干净 我 怎么 觉得 每天 我 的 脸 洗完 后 还 不是 很 干净 皮肤 也
+教育,35 年前 的 今天 一个 伟人 过世 了 请 您 在 此 说 一句 心里话
+烦恼,初恋 小 毛孩 的 疑惑 她 答应 做 我 女朋友 却 不愿 公开 为什么 呢 是因为 怕
+体育,请 输入您 的 问题 ... 山东 鲁能泰山 足球队 的 世界 俱乐部 排名 是 多少
+游戏,诛 仙 在 哪里 领 激活码 啊 在 哪里 领 激活码 啊
+健康,脖子 长 10CM 算长 还是 短 我 15 岁 男生 身高 167CM 脸圆
+健康,拉肚子 以后 身体虚弱 应该 吃些 什么 东西 调养 有 什么 应该 注意 的 不
+生活,生活 饮食习惯 对 人体 健康 的 影响 饮食 的 健康 会 对 人体 产生 怎样 的 影响
+生活,求 推荐 减肥药 各位 泪  泪  泪  泪
--- a/keras_textclassification/data/embeddings/bert/useless.txt/useless.txt
+++ b/keras_textclassification/data/embeddings/bert/useless.txt/useless.txt
@ -0,0 +1 @@
+useless
--- a/keras_textclassification/data/embeddings/term_char.txt
+++ b/keras_textclassification/data/embeddings/term_char.txt
--- a/keras_textclassification/data/model/fast_text/useless.txt
+++ b/keras_textclassification/data/model/fast_text/useless.txt
@ -0,0 +1 @@
+useless
--- a/keras_textclassification/etl/init.py
+++ b/keras_textclassification/etl/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:50
+# @author   :Mo
+# @function :
--- a/keras_textclassification/etl/text_preprocess.py
+++ b/keras_textclassification/etl/text_preprocess.py
@ -0,0 +1,145 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/5 21:36
+# @author   :Mo
+# @function :data utils of text classification
+
+from keras_textclassification.conf.path_config import path_fast_text_model
+# JSON files kept in the fast_text model directory. The label2index file is
+# written by PreprocessText.preprocess_baidu_qa_2019_idx and read back by
+# PreprocessText.prereocess_idx.
+path_fast_text_model_vocab2index = path_fast_text_model + 'vocab2index.json'
+path_fast_text_model_label2index = path_fast_text_model + 'label2index.json'
+
+import pandas as pd
+import numpy as np
+import jieba
+import json
+import re
+import os
+
+
+def extract_chinese(text):
+    """
+      只提取出中文、字母和数字
+    :param text: str, input of sentence
+    :return: 
+    """
+    chinese_exttract = ''.join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@.])", text))
+    return chinese_exttract
+
+
+def read_and_process(path):
+    """
+      读取文本数据并
+    :param path: 
+    :return: 
+    """
+    with open(path, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+        lines_x = [extract_chinese(line.split(",")[0]) for line in lines]
+        line_y = [extract_chinese(line.split(",")[1]) for line in lines]
+        return lines_x, line_y
+
+
+def preprocess_baidu_qa_2019(path):
+    x, y, x_y = [], [], []
+    x_y.append('label,ques\n')
+    with open(path, 'r', encoding='utf-8') as f:
+        while True:
+            line = f.readline()
+            try:
+                line_json = json.loads(line)
+            except:
+                break
+            ques = line_json['title']
+            label = line_json['category'][0:2]
+            line_x = " ".join([extract_chinese(word) for word in list(jieba.cut(ques, cut_all=False, HMM=True))]).strip().replace('  ',' ')
+            line_y = extract_chinese(label)
+            x_y.append(line_y+','+line_x+'\n')
+    #         x.append(line_x)
+    #         y.append(line_y)
+    # return x, y
+    return x_y
+
+
+
+def save_json(json_, path):
+    """
+      保存json，
+    :param json_: json 
+    :param path: str
+    :return: None
+    """
+    with open(path, 'w', encoding='utf-8') as fj:
+        fj.write(json.dumps(json_))
+
+
+def get_json(path):
+    """
+      获取json，只取第一行
+    :param path: str
+    :return: json
+    """
+    with open(path, 'r', encoding='utf-8') as fj:
+        model_json = json.loads(fj.readlines()[0])
+    return model_json
+
+
+class PreprocessText:
+    def __init__(self):
+        gg = 0
+
+    @staticmethod
+    def prereocess_idx(pred):
+        if os.path.exists(path_fast_text_model_label2index):
+            pred_i2l = {}
+            l2i_i2l = get_json(path_fast_text_model_label2index)
+            i2l = l2i_i2l['i2l']
+            for i in range(len(pred)):
+                pred_i2l[i2l[str(i)]] = pred[i]
+            pred_i2l_rank = [sorted(pred_i2l.items(), key=lambda k: k[1], reverse=True)]
+            return pred_i2l_rank
+        else:
+            raise RuntimeError("path_fast_text_model_label2index is None")
+
+    def preprocess_baidu_qa_2019_idx(self, path, embed):
+        data = pd.read_csv(path)
+        ques = data['ques'].tolist()
+        label = data['label'].tolist()
+        ques = [str(q).upper() for q in ques]
+        label = [str(l).upper() for l in label]
+        label_set = set(label)
+        count = 0
+        label2index = {}
+        index2label = {}
+        for label_one in label_set:
+            label2index[label_one] = count
+            index2label[count] = label_one
+            count = count + 1
+        l2i_i2l = {}
+        l2i_i2l['l2i'] = label2index
+        l2i_i2l['i2l'] = index2label
+        save_json(l2i_i2l, path_fast_text_model_label2index)
+
+        x = []
+        for que in ques:
+            que_embed = embed.sentence2idx(que)
+            x.append(que_embed)
+        label_zo = []
+        for label_one in label:
+            label_zeros = [0] * len(l2i_i2l['l2i'])
+            label_zeros[l2i_i2l['l2i'][label_one]] = 1
+            label_zo.append(label_zeros)
+
+        return np.array(x), np.array(label_zo)
+
+
+if __name__=="__main__":
+    # path = 'Y:/BaiduNetdiskDownload/DataSet/corpus/baike_qa2019/'
+    # name = 'baike_qa_train.json'
+    # # x, y = preprocess_baidu_qa_2019(path + name)
+    # x_y = preprocess_baidu_qa_2019(path + name)
+    # with open(name.replace('.json', '.csv'), 'w', encoding='utf-8') as f:
+    #     f.writelines(x_y)
+
+    from keras_textclassification.conf.path_config import path_baidu_qa_2019_valid
+    pt = PreprocessText()
+    pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid)
--- a/keras_textclassification/m01_FastText/init.py
+++ b/keras_textclassification/m01_FastText/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m01_FastText/graph.py
+++ b/keras_textclassification/m01_FastText/graph.py
@ -0,0 +1,37 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :graph of fasttext
+# @paper: Bag of Tricks for Efﬁcient Text Classiﬁcation(https://arxiv.org/abs/1607.01759)
+
+
+from keras_textclassification.base.graph import graph
+from keras.layers import Dense
+from keras.layers import GlobalMaxPooling1D
+from keras.models import Model
+
+
+class FastTextGraph(graph):
+    def __init__(self, hyper_parameters):
+        """
+            初始化
+        :param hyper_parameters: json，超参
+        """
+        super().__init__(hyper_parameters)
+
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters:json,  hyper parameters of network
+        :return: tensor, moedl
+        """
+        super().create_model(hyper_parameters)
+        embedding = self.word_embedding.output
+        x = GlobalMaxPooling1D()(embedding)
+        output = Dense(self.label, activation=self.activate_classify)(x)
+        self.model = Model(inputs=self.word_embedding.input, outputs=output)
+        self.model.summary(120)
+
+
--- a/keras_textclassification/m01_FastText/predict.py
+++ b/keras_textclassification/m01_FastText/predict.py
@ -0,0 +1,57 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :predict of fast text with baidu-qa-2019 in question title
+
+
+import numpy as np
+
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m01_FastText.graph import FastTextGraph
+
+if __name__=="__main__":
+    hyper_parameters = { 'model': {   'label': 17,
+                                     'batch_size': 256,
+                                     'embed_size': 300,
+                                     'filters': [3, 4, 5],
+                                     'kernel_size': 3,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000,
+                                     'lr': 1e-4,
+                                     'l2': 1e-9,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random',
+                                     'is_training': False,
+                                     'model_path': path_model_fast_text_baiduqa_2019,},
+                         'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 300,
+                                      'len_max': 50,},
+                         }
+    # ns = np.array([1,2,3,4])
+    # print(type(ns))
+    pt = PreprocessText
+    graph = FastTextGraph(hyper_parameters)
+    graph.load_model()
+    ra_ed = graph.word_embedding
+    ques = '你好呀'
+    ques_embed = ra_ed.sentence2idx(ques)
+    pred = graph.predict(np.array([ques_embed]))
+    pre = pt.prereocess_idx(pred[0])
+    print(pre)
+    while True:
+        print("请输入: ")
+        ques = input()
+        ques_embed = ra_ed.sentence2idx(ques)
+        pred = graph.predict(np.array([ques_embed]))
+        pre = pt.prereocess_idx(pred[0])
+        print(pre)
--- a/keras_textclassification/m01_FastText/train.py
+++ b/keras_textclassification/m01_FastText/train.py
@ -0,0 +1,50 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :train of fast text with baidu-qa-2019 in question title
+
+from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m01_FastText.graph import FastTextGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 256,
+                                     'embed_size': 300,
+                                     'filters': [2, 3, 4],
+                                     'kernel_size': 3,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random',
+                                     'is_training': True,
+                                     'model_path': path_model_fast_text_baiduqa_2019,},
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 300,
+                                      'len_max': 50,},
+                         }
+    graph = Graph(hyper_parameters)
+    ra_ed = graph.word_embedding
+    pt = PreprocessText()
+    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train, ra_ed)
+    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid, ra_ed)
+    print(len(y_train))
+    graph.fit(x_train, y_train, x_val, y_val)
+
+# 1425170/1425170 [==============================] - 83s 58us/step - loss: 0.9383 - acc: 0.7106 - val_loss: 2.4205 - val_acc: 0.5029
+# Epoch 00001: val_loss improved from inf to 2.42050, saving model to D:\workspace\pythonMyCode\django_project\ClassificationTextChinese/data/model/fast_text/model_fast_text.f5
+# Epoch 2/20
+# 验证集准确率50%左右
+# time时间大约在2*4轮=8分钟左右
--- a/keras_textclassification/m02_TextCNN/init.py
+++ b/keras_textclassification/m02_TextCNN/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/7 22:09
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m02_TextCNN/graph.py
+++ b/keras_textclassification/m02_TextCNN/graph.py
@ -0,0 +1,56 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :graph of TextCNN
+
+
+from keras.layers import Concatenate
+from keras.layers import Conv2D, MaxPool2D
+from keras.layers import Dense
+from keras.layers import Dropout
+from keras.layers import Flatten
+from keras.layers import Reshape
+from keras.models import Model
+
+from keras_textclassification.base import graph
+
+
+class TextCNNGraph(graph):
+    def __init__(self, hyper_parameters):
+        """
+            初始化
+        :param hyper_parameters: json，超参
+        """
+        super().__init__(hyper_parameters)
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters:json,  hyper parameters of network
+        :return: tensor, moedl
+        """
+        super().create_model(hyper_parameters)
+        embedding = self.word_embedding.output
+        embedding_reshape = Reshape((self.len_max, self.embed_size, 1))(embedding)
+        # 提取n-gram特征和最大池化， 一般不用平均池化
+        conv_pools = []
+        for filter in self.filters:
+            conv = Conv2D(filters = self.kernel_size,
+                          kernel_size = (filter, self.embed_size),
+                          padding = 'valid',
+                          kernel_initializer = 'normal',
+                          activation = 'relu',
+                          )(embedding_reshape)
+            pooled = MaxPool2D(pool_size = (self.len_max - filter + 1, 1),
+                               strides = (1, 1),
+                               padding = 'valid',
+                               )(conv)
+            conv_pools.append(pooled)
+        # 拼接
+        x = Concatenate(axis=1)(conv_pools)
+        x = Flatten()(x)
+        x = Dropout(self.dropout)(x)
+        output = Dense(units=self.label, activation=self.activate_classify)(x)
+        self.model = Model(inputs=self.word_embedding.input, outputs=output)
+        self.model.summary(120)
--- a/keras_textclassification/m02_TextCNN/predict.py
+++ b/keras_textclassification/m02_TextCNN/predict.py
@ -0,0 +1,57 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :predict of TextCNN with baidu-qa-2019 in question title
+
+
+import numpy as np
+
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = { 'model': {   'label': 17,
+                                     'batch_size': 256,
+                                     'embed_size': 300,
+                                     'filters': [2, 3, 4],
+                                     'kernel_size': 300,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000,
+                                     'lr': 1e-4,
+                                     'l2': 1e-9,
+                                     'activate_classify': 'softmax', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'embedding_type': 'random',
+                                     'is_training': False,
+                                     'model_path': path_model_fast_text_baiduqa_2019,},
+                         'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 300,
+                                      'len_max': 50,},
+                         }
+    # ns = np.array([1,2,3,4])
+    # print(type(ns))
+    pt = PreprocessText
+    graph = Graph(hyper_parameters)
+    graph.load_model()
+    ra_ed = graph.word_embedding
+    ques = '你好呀'
+    ques_embed = ra_ed.sentence2idx(ques)
+    pred = graph.predict(np.array([ques_embed]))
+    pre = pt.prereocess_idx(pred[0])
+    print(pre)
+    while True:
+        print("请输入: ")
+        ques = input()
+        ques_embed = ra_ed.sentence2idx(ques)
+        pred = graph.predict(np.array([ques_embed]))
+        pre = pt.prereocess_idx(pred[0])
+        print(pre)
--- a/keras_textclassification/m02_TextCNN/train.py
+++ b/keras_textclassification/m02_TextCNN/train.py
@ -0,0 +1,54 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :train of TextCNN with baidu-qa-2019 in question title
+
+import time
+
+from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 256,
+                                     'embed_size': 300,
+                                     'filters': [2, 3, 4],
+                                     'kernel_size': 300,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': True,
+                                     'model_path': path_model_fast_text_baiduqa_2019,},
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 300,
+                                      'len_max': 50,},
+                         }
+    time_start  = time.time()
+    graph = Graph(hyper_parameters)
+    ra_ed = graph.word_embedding
+    pt = PreprocessText()
+    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train, ra_ed)
+    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid, ra_ed)
+    print(len(y_train))
+    graph.fit(x_train, y_train, x_val, y_val)
+    print("耗时:" + str(time.time()-time_start))
+
+# 1425170/1425170 [==============================] - 648s 455us/step - loss: 0.8771 - acc: 0.7317 - val_loss: 1.2533 - val_acc: 0.7069
+# Epoch 00001: val_loss improved from inf to 1.25335, saving model to D:\workspace\pythonMyCode\django_project\ClassificationTextChinese/data/model/fast_text/model_fast_text.f5
+# Epoch 2/20
+# TIME: 20 * 5轮= 100 (min)
+# acc: 0.7317
--- a/keras_textclassification/m03_CharCNN/init.py
+++ b/keras_textclassification/m03_CharCNN/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/7 23:08
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m03_CharCNN/graph_yoon_kim.py
+++ b/keras_textclassification/m03_CharCNN/graph_yoon_kim.py
@ -0,0 +1,246 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/8 11:45
+# @author   :Mo
+# @function :char CNN of 'Yoon Kim'
+# paper: 2015, Character-Aware Neural Language Models(https://arxiv.org/abs/1508.06615)
+
+from __future__ import print_function, division
+
+import keras
+import numpy as np
+from keras import backend as K
+from keras import regularizers
+from keras.engine import Layer
+from keras.initializers import Constant
+from keras.layers import Bidirectional, GRU
+# char cnn
+from keras.layers import Convolution2D, MaxPooling2D
+from keras.layers import Dense, Activation, Multiply, Add, Lambda
+from keras.layers import Dropout, Reshape, Concatenate, BatchNormalization
+from keras.layers import TimeDistributed, Flatten
+from keras.models import Model
+
+from keras_textclassification.base import graph
+
+
+class CharCNNGraph(graph):
+    def __init__(self, hyper_parameters):
+        """
+            初始化
+        :param hyper_parameters: json，超参
+        """
+        self.char_cnn_layers = hyper_parameters['model'].get('char_cnn_layers',
+            [[50, 1], [100, 2], [150, 3], [200, 4], [200, 5], [200, 6], [200, 7]]) # large
+            # [[25, 1], [50, 2], [75, 3], [100, 4], [125, 5], [150, 6]])  # small
+        self.highway_layers = hyper_parameters['model'].get('highway_layers', 2)
+        self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 2)
+        self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM')
+        self.rnn_units = hyper_parameters['model'].get('rnn_units', 650) # large, small is 300
+        self.len_max_word = hyper_parameters['model'].get('len_max_word', 30)
+        super().__init__(hyper_parameters)
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters:json,  hyper parameters of network
+        :return: tensor, moedl
+        """
+        super().create_model(hyper_parameters)
+        embedding_output = self.word_embedding.output
+        embedding_output = Reshape((self.len_max, self.embed_size, 1))(embedding_output) # (None, 50, 30, 1)
+        embedding_output = Concatenate()([embedding_output for i in range(self.len_max_word)]) # (None, 50, 30, 21)
+        embedding_output = Reshape((self.len_max, self.len_max_word, self.embed_size))(embedding_output) # (None, 50, 21, 30)
+        conv_out = []
+        for char_cnn_size in self.char_cnn_layers:
+            conv = Convolution2D(name='Convolution2D_{}_{}'.format(char_cnn_size[0], char_cnn_size[1]),
+                                 filters=char_cnn_size[0],
+                                 kernel_size= (1, char_cnn_size[1]),
+                                 activation='tanh')(embedding_output)
+            pooled = MaxPooling2D(name='MaxPooling2D_{}_{}'.format(char_cnn_size[0], char_cnn_size[1]),
+                                  pool_size=(1, self.len_max_word - char_cnn_size[1] + 1)
+                                  )(conv)
+            conv_out.append(pooled)
+        x = Concatenate()(conv_out) #  (None, 50, 1, 1100)
+        x = Reshape((self.len_max, K.int_shape(x)[2] * sum(np.array([ccl[0] for ccl in self.char_cnn_layers]))))(x) # (None, 50, 1100)
+        x = BatchNormalization()(x)
+        # Highway layers
+        for hl in range(self.highway_layers):
+            # 两个都可以，第二个是我自己写的
+            # x = TimeDistributed(Highway(activation='sigmoid', transform_gate_bias=-2, input_shape=K.int_shape(x)[1:2]))(x)
+            x = TimeDistributed(Lambda(highway_keras, name="highway_keras"))(x)
+        # rnn layers
+        for nrl in range(self.num_rnn_layers):
+            x = Bidirectional(GRU(units=self.rnn_units, return_sequences=True,
+                                         kernel_regularizer=regularizers.l2(0.32 * 0.1),
+                                         recurrent_regularizer=regularizers.l2(0.32)
+                                   ))(x)
+            # x = GRU(units=self.rnn_units, return_sequences=True,
+            #                        kernel_regularizer=regularizers.l2(0.32 * 0.1),
+            #                        recurrent_regularizer=regularizers.l2(0.32)
+            #                        )(x)
+
+            x = Dropout(self.dropout)(x)
+        x = Flatten()(x)
+        output = Dense(units=self.label, activation=self.activate_classify)(x)
+        self.model = Model(inputs=self.word_embedding.input, outputs=output)
+        self.model.summary(120)
+
+
+def highway_keras(x):
+    # writter by my own
+    # paper； Highway Network(http://arxiv.org/abs/1505.00387).
+    # 公式
+    # 1. s = sigmoid(Wx + b)
+    # 2. z = s * relu(Wx + b) + (1 - s) * x
+    # x shape : [N * time_depth, sum(filters)]
+
+    # Table 1. CIFAR-10 test set accuracy of convolutional highway networks with
+    # rectified linear activation and sigmoid gates.
+    # For comparison, results reported by Romero et al. (2014)
+    # using maxout networks are also shown.
+    # Fitnets were trained using a two step training procedure using soft targets from the trained Teacher network,
+    # which was trained using backpropagation. We trained all highway networks directly using backpropagation.
+    # * indicates networks which were trained only on a set of 40K out of 50K examples in the training set.
+
+
+
+    # Figure 2. Visualization of certain internals of the blocks in the best 50 hidden layer highway networks trained on MNIST
+    # (top row) and CIFAR-100 (bottom row). The first hidden layer is a plain layer which changes the dimensionality of the representation to 50. Each of
+    # the 49 highway layers (y-axis) consists of 50 blocks (x-axis).
+    # The first column shows the transform gate biases, which were initialized to -2 and -4 respectively.
+    # In the second column the mean output of the transform gate over 10,000 training examples is depicted.
+    # The third and forth columns show the output of the transform gates and
+    # the block outputs for a single random training sample.
+
+    gate_transform = Dense(units=K.int_shape(x)[1],
+                           activation='sigmoid',
+                           use_bias=True,
+                           kernel_initializer='glorot_uniform',
+                           bias_initializer=keras.initializers.Constant(value=-2))(x)
+    gate_cross = 1 - gate_transform
+    block_state = Dense(units=K.int_shape(x)[1],
+                        activation='relu',
+                        use_bias=True,
+                        kernel_initializer='glorot_uniform',
+                        bias_initializer='zero')(x)
+    high_way = gate_transform * block_state + gate_cross * x
+
+    return high_way
+
+# NOTE(review): `gg` is not referenced anywhere else in this module; it looks
+# like a leftover debugging anchor -- confirm before removing.
+gg = 0
+class Highway(Layer):
+    """
+      codes from github: https://github.com/batikim09/Keras_highways/blob/master/src/conv2d_highway.py
+    """
+    activation = None
+    transform_gate_bias = None
+
+    def __init__(self, activation='relu', transform_gate_bias=-2, **kwargs):
+        self.activation = activation
+        self.transform_gate_bias = transform_gate_bias
+        super(Highway, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # Create a trainable weight variable for this layer.
+        dim = input_shape[-1]
+        self.dense_1 = Dense(units=dim, bias_initializer=Constant(self.transform_gate_bias))
+        self.dense_1.build(input_shape)
+        self.dense_2 = Dense(units=dim)
+        self.dense_2.build(input_shape)
+        self.trainable_weights = self.dense_1.trainable_weights + self.dense_2.trainable_weights
+        super(Highway, self).build(input_shape)  # Be sure to call this at the end
+
+    def call(self, x):
+        dim = K.int_shape(x)[-1]
+        transform_gate = self.dense_1(x)
+        transform_gate = Activation("sigmoid")(transform_gate)
+        carry_gate = Lambda(lambda x: 1.0 - x, output_shape=(dim,))(transform_gate)
+        transformed_data = self.dense_2(x)
+        transformed_data = Activation(self.activation)(transformed_data)
+        transformed_gated = Multiply()([transform_gate, transformed_data])
+        identity_gated = Multiply()([carry_gate, x])
+        value = Add()([transformed_gated, identity_gated])
+        return value
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+    def get_config(self):
+        config = super().get_config()
+        config['activation'] = self.activation
+        config['transform_gate_bias'] = self.transform_gate_bias
+        return config
+
+# def highway(input_, size, num_layers=1, bias=-2.0, f=K.relu):
+#     """ github : https://github.com/mkroutikov/tf-lstm-char-cnn/blob/master/model.py
+#     Highway Network (cf. http://arxiv.org/abs/1505.00387).
+#     t = sigmoid(Wy + b)
+#     z = t * g(Wy + b) + (1 - t) * y
+#     where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
+#     """
+#
+#     def linear(input_, output_size, scope=None):
+#         '''
+#         Linear map: output[k] = sum_i(Matrix[k, i] * args[i] ) + Bias[k]
+#         Args:
+#             args: a tensor or a list of 2D, batch x n, Tensors.
+#         output_size: int, second dimension of W[i].
+#         scope: VariableScope for the created subgraph; defaults to "Linear".
+#       Returns:
+#         A 2D Tensor with shape [batch x output_size] equal to
+#         sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
+#       Raises:
+#         ValueError: if some of the arguments has unspecified or wrong shape.
+#       '''
+#
+#         shape = input_.get_shape().as_list()
+#         if len(shape) != 2:
+#             raise ValueError("Linear is expecting 2D arguments: %s" % str(shape))
+#         if not shape[1]:
+#             raise ValueError("Linear expects shape[1] of arguments: %s" % str(shape))
+#         input_size = shape[1]
+#
+#         # Now the computation.
+#         matrix = tf.get_variable("Matrix", [output_size, input_size], dtype=input_.dtype)
+#         bias_term = tf.get_variable("Bias", [output_size], dtype=input_.dtype)
+#
+#         return K.matmul(input_, tf.transpose(matrix)) + bias_term
+#
+#     for idx in range(num_layers):
+#         g = f(linear(input_, size, scope='highway_lin_%d' % idx))
+#
+#         t = K.sigmoid(linear(input_, size, scope='highway_gate_%d' % idx) + bias)
+#
+#         output = t * g + (1. - t) * input_
+#         input_ = output
+#     return output
+
+# NOTE(review): `gg` is not referenced anywhere else in this module; it looks
+# like a leftover debugging anchor -- confirm before removing.
+gg = 0
+
+# def highway_network(embedding, units):
+#         # github: https://github.com/SeonbeomKim/TensorFlow-lstm-char-cnn/blob/master/lstm_char_cnn.py
+# 		# embedding: [N*time_depth, sum(filters)]
+# 		transform_gate = tf.layers.dense(
+# 				embedding,
+# 				units=units,
+# 				activation=tf.nn.sigmoid,
+# 				kernel_initializer=self.initializer,
+# 				bias_initializer=tf.constant_initializer(-2)
+# 			) # [N*time_depth, sum(filters)]
+#
+# 		carry_gate = 1-transform_gate # [N*time_depth, sum(filters)]
+# 		block_state = tf.layers.dense(
+# 				embedding,
+# 				units=units,
+# 				activation=tf.nn.relu,
+# 				kernel_initializer=self.initializer,
+# 				bias_initializer=self.initializer
+# 			) # [N*time_depth, sum(filters)]
+# 		highway = transform_gate * block_state + carry_gate * embedding # [N*time_depth, sum(filters)]
+# 			# if transfor_gate is 1. then carry_gate is 0. so only use block_state
+# 			# if transfor_gate is 0. then carry_gate is 1. so only use embedding
+# 			# if transfor_gate is 0.@@. then carry_gate is 0.@@. so use sum of scaled block_state and embedding
+# 		return highway # [N*time_depth, sum(filters)]
+
+# NOTE(review): `gg` is not referenced anywhere else in this module; it looks
+# like a leftover debugging anchor -- confirm before removing.
+gg = 0
--- a/keras_textclassification/m03_CharCNN/graph_zhang.py
+++ b/keras_textclassification/m03_CharCNN/graph_zhang.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 14:32
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m03_CharCNN/predict.py
+++ b/keras_textclassification/m03_CharCNN/predict.py
@ -0,0 +1,66 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 14:11
+# @author   :Mo
+# @function :predict of CharCNN (Yoon Kim) with baidu-qa-2019 in question title
+
+import numpy as np
+
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m03_CharCNN import CharCNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 64,
+                                     'embed_size': 30,
+                                     'filters': [2, 3, 4], # 这里无用
+                                     'kernel_size': 30,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': False,
+                                     'model_path': path_model_fast_text_baiduqa_2019,
+                                     'char_cnn_layers': [[50, 1], [100, 2], [150, 3],[200, 4], [200, 5], [200, 6],[200, 7]],  # large
+                                     # [[25, 1], [50, 2], [75, 3], [100, 4], [125, 5], [150, 6]])  # small
+                                     'highway_layers': 1, # large:2; small:1
+                                     'num_rnn_layers': 1, # 论文是2，但训练实在是太慢了
+                                     'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
+                                     'rnn_units': 650,  # large 650, small is 300
+                                     'len_max_word': 26,
+                                     },
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 30,
+                                      'len_max': 50,
+                                      'len_max_word': 26
+                                      },
+                         }
+
+    pt = PreprocessText
+    graph = Graph(hyper_parameters)
+    graph.load_model()
+    ra_ed = graph.word_embedding
+    ques = '你好呀'
+    ques_embed = ra_ed.sentence2idx(ques)
+    pred = graph.predict(np.array([ques_embed]))
+    pre = pt.prereocess_idx(pred[0])
+    print(pre)
+    while True:
+        print("请输入: ")
+        ques = input()
+        ques_embed = ra_ed.sentence2idx(ques)
+        print(ques_embed)
+        pred = graph.predict(np.array([ques_embed]))
+        pre = pt.prereocess_idx(pred[0])
+        print(pre)
--- a/keras_textclassification/m03_CharCNN/train.py
+++ b/keras_textclassification/m03_CharCNN/train.py
@ -0,0 +1,76 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/8 14:37
+# @author   :Mo
+# @function :train of CharCNNGraph_kim with baidu-qa-2019 in question title
+import os
+import pathlib
+import sys
+
+project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)
+sys.path.append(project_path)
+print(project_path)
+
+from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m03_CharCNN import CharCNNGraph
+
+import random
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 64,
+                                     'embed_size': 30,
+                                     'filters': [2, 3, 4], # 这里无用
+                                     'kernel_size': 30,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': True,
+                                     'model_path': path_model_fast_text_baiduqa_2019,
+                                     'char_cnn_layers': [[50, 1], [100, 2], [150, 3],[200, 4], [200, 5], [200, 6],[200, 7]],  # large
+                                     # [[25, 1], [50, 2], [75, 3], [100, 4], [125, 5], [150, 6]])  # small
+                                     'highway_layers': 1, # large:2; small:1
+                                     'num_rnn_layers': 1, # 论文是2，但训练实在是太慢了
+                                     'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
+                                     'rnn_units': 650,  # large 650, small is 300
+                                     'len_max_word': 26,
+                                     },
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 30,
+                                      'len_max': 50,
+                                      'len_max_word': 26
+                                      },
+                         }
+    graph = CharCNNGraph(hyper_parameters)
+    ra_ed = graph.word_embedding
+    pt = PreprocessText()
+    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train, ra_ed)
+    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid, ra_ed)
+    indexs = [ids for ids in range(len(y_train))]
+    random.shuffle(indexs)
+    x_train, y_train = x_train[indexs], y_train[indexs]
+
+    print(len(y_train))
+    graph.fit(x_train[0:32000], y_train[0:32000], x_val, y_val)
+
+
+# 1425170/1425170 [==============================] - 6498s 5ms/step - loss: 1.3809 - acc: 0.7042 - val_loss: 0.8345 - val_acc: 0.7534
+# Epoch 00001: val_loss improved from inf to 0.83452, saving model to /home/ap/nlp/myzhuo/ClassificationTextChinese/data/model/fast_text/model_fast_text.f5
+# Epoch 2/20
+# 1425170/1425170 [==============================] - 6494s 5ms/step - loss: 0.8262 - acc: 0.7518 - val_loss: 0.7535 - val_acc: 0.7705
+# Epoch 00002: val_loss improved from 0.83452 to 0.75352, saving model to /home/ap/nlp/myzhuo/ClassificationTextChinese/data/model/fast_text/model_fast_text.f5
+# Epoch 3/20
+#  306816/1425170 [=====>........................] - ETA: 1:23:55 - loss: 0.7666 - acc: 0.7673
--- a/keras_textclassification/m04_TextRNN/init.py
+++ b/keras_textclassification/m04_TextRNN/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 14:38
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m04_TextRNN/graph.py
+++ b/keras_textclassification/m04_TextRNN/graph.py
@ -0,0 +1,62 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :graph of TextRNN
+
+
+from keras import regularizers
+from keras.layers import Dense
+from keras.layers import Dropout, Flatten
+from keras.layers import LSTM, GRU, Bidirectional, CuDNNLSTM, CuDNNGRU
+from keras.models import Model
+
+from keras_textclassification.base import graph
+
+
+class TextRNNGraph(graph):
+    def __init__(self, hyper_parameters):
+        """
+            初始化
+        :param hyper_parameters: json，超参
+        """
+        self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 2)
+        self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM')
+        self.rnn_units = hyper_parameters['model'].get('rnn_units', 650)  # large, small is 300
+        super().__init__(hyper_parameters)
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters:json,  hyper parameters of network
+        :return: tensor, moedl
+        """
+        super().create_model(hyper_parameters)
+        x = self.word_embedding.output
+        # x = Reshape((self.len_max, self.embed_size, 1))(embedding)
+        if self.rnn_units=="LSTM":
+                layer_cell = LSTM
+        elif self.rnn_units=="GRU":
+                layer_cell = GRU
+        elif self.rnn_units=="CuDNNLSTM":
+                layer_cell = CuDNNLSTM
+        elif self.rnn_units=="CuDNNGRU":
+                layer_cell = CuDNNGRU
+        else:
+            layer_cell = GRU
+
+        # Bi-LSTM
+        for nrl in range(self.num_rnn_layers):
+            x = Bidirectional(layer_cell(units=self.rnn_units,
+                                         return_sequences=True,
+                                         activation='relu',
+                                         kernel_regularizer=regularizers.l2(0.32 * 0.1),
+                                         recurrent_regularizer=regularizers.l2(0.32)
+                                         ))(x)
+            x = Dropout(self.dropout)(x)
+        x = Flatten()(x)
+        # 最后就是softmax
+        dense_layer = Dense(self.label, activation=self.activate_classify)(x)
+        output = [dense_layer]
+        self.model = Model(self.word_embedding.input, output)
+        self.model.summary(120)
--- a/keras_textclassification/m04_TextRNN/predict.py
+++ b/keras_textclassification/m04_TextRNN/predict.py
@ -0,0 +1,56 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :predict of TextRNN with baidu-qa-2019 in question title
+
+
+import numpy as np
+
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m04_TextRNN.graph import TextRNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = { 'model': {   'label': 17,
+                                     'batch_size': 256,
+                                     'embed_size': 300,
+                                     'filters': [2, 3, 4],
+                                     'kernel_size': 300,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000,
+                                     'lr': 1e-4,
+                                     'l2': 1e-9,
+                                     'activate_classify': 'softmax', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'embedding_type': 'random',
+                                     'is_training': False,
+                                     'model_path': path_model_fast_text_baiduqa_2019,},
+                         'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 300,
+                                      'len_max': 50,},
+                         }
+
+    pt = PreprocessText
+    graph = Graph(hyper_parameters)
+    graph.load_model()
+    ra_ed = graph.word_embedding
+    ques = '你好呀'
+    ques_embed = ra_ed.sentence2idx(ques)
+    pred = graph.predict(np.array([ques_embed]))
+    pre = pt.prereocess_idx(pred[0])
+    print(pre)
+    while True:
+        print("请输入: ")
+        ques = input()
+        ques_embed = ra_ed.sentence2idx(ques)
+        pred = graph.predict(np.array([ques_embed]))
+        pre = pt.prereocess_idx(pred[0])
+        print(pre)
--- a/keras_textclassification/m04_TextRNN/train.py
+++ b/keras_textclassification/m04_TextRNN/train.py
@ -0,0 +1,67 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/3 10:51
+# @author   :Mo
+# @function :train of TextRNN with baidu-qa-2019 in question title
+
+import pathlib
+import sys
+import os
+
+project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)
+sys.path.append(project_path)
+print(project_path)
+
+from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m04_TextRNN.graph import TextRNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 16,
+                                     'embed_size': 30,
+                                     'filters': [2, 3, 4], # 这里无用
+                                     'kernel_size': 30,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': True,
+                                     'model_path': path_model_fast_text_baiduqa_2019,
+                                     'num_rnn_layers': 1, # 论文是2，但训练实在是太慢了
+                                     'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
+                                     'rnn_units': 256,  # large 650, small is 300
+                                     },
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 30,
+                                      'len_max': 50,
+                                      },
+                         }
+    import time
+    time_start  = time.time()
+    graph = Graph(hyper_parameters)
+    ra_ed = graph.word_embedding
+    pt = PreprocessText()
+    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train, ra_ed)
+    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid, ra_ed)
+    print(len(y_train))
+    graph.fit(x_train, y_train, x_val, y_val)
+    print("耗时:" + str(time.time()-time_start))
+
+    # indexs = [ids for ids in range(len(y_train))]
+    # random.shuffle(indexs)
+    # x_train, y_train = x_train[indexs], y_train[indexs]
+    # graph.fit(x_train[0:32000], y_train[0:32000], x_val[0:3200], y_val[0:3200])
+
+    graph.fit(x_train, y_train, x_val, y_val)
--- a/keras_textclassification/m05_TextRCNN/init.py
+++ b/keras_textclassification/m05_TextRCNN/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 15:41
+# @author   :Mo
+# @function :
--- a/keras_textclassification/m05_TextRCNN/graph.py
+++ b/keras_textclassification/m05_TextRCNN/graph.py
@ -0,0 +1,103 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/8 11:45
+# @author   :Mo
+# @function :RCNN model
+# paper: http://www.nlpr.ia.ac.cn/cip/~liukang/liukangPageFile/Recurrent%20Convolutional%20Neural%20Networks%20for%20Text%20Classification.pdf
+
+from __future__ import print_function, division
+
+from keras.layers import Conv2D, MaxPooling2D, Dense, Lambda
+from keras.layers import Dropout, Reshape, Concatenate
+from keras.layers import LSTM, GRU
+from keras.layers import Flatten
+from keras.models import Model
+from keras import backend as K
+from keras import regularizers
+
+from keras_textclassification.base import graph
+
+
+
+class RCNNGraph(graph):
+    def __init__(self, hyper_parameters):
+        """
+            初始化
+        :param hyper_parameters: json，超参
+        """
+        self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM')
+        self.rnn_units = hyper_parameters['model'].get('rnn_units', 650) # large, small is 300
+        super().__init__(hyper_parameters)
+
+    def create_model(self, hyper_parameters):
+        """
+            构建神经网络
+        :param hyper_parameters:json,  hyper parameters of network
+        :return: tensor, moedl
+        """
+        super().create_model(hyper_parameters)
+        embedding_output = self.word_embedding.output
+        # rnn layers
+        if self.rnn_units=="LSTM":
+                layer_cell = LSTM
+        else:
+            layer_cell = GRU
+        # 反向
+        x_backwords = layer_cell(units=self.rnn_units,
+                                    return_sequences=True,
+                                    kernel_regularizer=regularizers.l2(0.32 * 0.1),
+                                    recurrent_regularizer=regularizers.l2(0.32),
+                                    go_backwards = True)(embedding_output)
+        x_backwords_reverse = Lambda(lambda x: K.reverse(x, axes=1))(x_backwords)
+        # 前向
+        x_fordwords = layer_cell(units=self.rnn_units,
+                                    return_sequences=True,
+                                    kernel_regularizer=regularizers.l2(0.32 * 0.1),
+                                    recurrent_regularizer=regularizers.l2(0.32),
+                                    go_backwards = False)(embedding_output)
+        # 拼接
+        x_feb = Concatenate(axis=2)([x_fordwords, embedding_output, x_backwords_reverse])
+
+        ####使用多个卷积核##################################################
+        x_feb = Dropout(self.dropout)(x_feb)
+        # Concatenate后的embedding_size
+        dim_2 = K.int_shape(x_feb)[2]
+        x_feb_reshape = Reshape((self.len_max, dim_2, 1))(x_feb)
+        # 提取n-gram特征和最大池化， 一般不用平均池化
+        conv_pools = []
+        for filter in self.filters:
+            conv = Conv2D(filters = self.kernel_size,
+                          kernel_size = (filter, dim_2),
+                          padding = 'valid',
+                          kernel_initializer = 'normal',
+                          activation = 'relu',
+                          )(x_feb_reshape)
+            pooled = MaxPooling2D(pool_size = (self.len_max - filter + 1, 1),
+                                   strides = (1, 1),
+                                   padding = 'valid',
+                                   )(conv)
+            conv_pools.append(pooled)
+        # 拼接
+        x = Concatenate()(conv_pools)
+        x = Flatten()(x)
+        #########################################################################
+
+        output = Dense(units=self.label, activation=self.activate_classify)(x)
+        self.model = Model(inputs=self.word_embedding.input, outputs=output)
+        self.model.summary(120)
+
+
+# 卷积的2种方式
+# # 1 github: https://github.com/ShawnyXiao/TextClassification-Keras/tree/master/model/RCNN/rcnn.py
+# x = Conv1D(64, kernel_size=1, activation='tanh')(x)
+# x = GlobalMaxPooling1D()(x)
+#
+#
+# # 2 github : https://github.com/airalcorn2/Recurrent-Convolutional-Neural-Network-Text-Classifier/blob/master/recurrent_convolutional_keras.py
+# semantic = Conv1D(hidden_dim_2, kernel_size=1, activation="tanh")()  # See equation (4).
+# # Keras provides its own max-pooling layers, but they cannot handle variable length input
+# # (as far as I can tell). As a result, I define my own max-pooling layer here.
+# pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(hidden_dim_2,))(semantic)  # See equation (5).
+
+
+
--- a/keras_textclassification/m05_TextRCNN/predict.py
+++ b/keras_textclassification/m05_TextRCNN/predict.py
@ -0,0 +1,60 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 14:11
+# @author   :Mo
+# @function :
+
+import numpy as np
+
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m05_TextRCNN.graph import RCNNGraph as Graph
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 64,
+                                     'embed_size': 30,
+                                     'filters': [2, 3, 4], # 这里无用
+                                     'kernel_size': 30,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': False,
+                                     'model_path': path_model_fast_text_baiduqa_2019,
+                                     'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
+                                     'rnn_units': 256,  # large 650, small is 300
+                                     },
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 30,
+                                      'len_max': 50,
+                                      },
+                         }
+
+    pt = PreprocessText
+    graph = Graph(hyper_parameters)
+    graph.load_model()
+    ra_ed = graph.word_embedding
+    ques = '我要打王者荣耀'
+    ques_embed = ra_ed.sentence2idx(ques)
+    pred = graph.predict(np.array([ques_embed]))
+    pre = pt.prereocess_idx(pred[0])
+    print(pre)
+    while True:
+        print("请输入: ")
+        ques = input()
+        ques_embed = ra_ed.sentence2idx(ques)
+        print(ques_embed)
+        pred = graph.predict(np.array([ques_embed]))
+        pre = pt.prereocess_idx(pred[0])
+        print(pre)
--- a/keras_textclassification/m05_TextRCNN/train.py
+++ b/keras_textclassification/m05_TextRCNN/train.py
@ -0,0 +1,64 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/8 14:37
+# @author   :Mo
+# @function :train of RCNNGraph_kim with baidu-qa-2019 in question title
+import os
+import pathlib
+import sys
+
+project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)
+sys.path.append(project_path)
+print(project_path)
+
+from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
+from keras_textclassification.conf.path_config import path_model_fast_text_baiduqa_2019
+from keras_textclassification.conf.path_config import path_embedding_random_char
+from keras_textclassification.etl.text_preprocess import PreprocessText
+from keras_textclassification.m05_TextRCNN.graph import RCNNGraph as Graph
+
+import random
+
+if __name__=="__main__":
+    hyper_parameters = {'model': {   'label': 17,
+                                     'batch_size': 64,
+                                     'embed_size': 30,
+                                     'filters': [2, 3, 4],
+                                     'kernel_size': 30,
+                                     'channel_size': 1,
+                                     'dropout': 0.5,
+                                     'decay_step': 100,
+                                     'decay_rate': 0.9,
+                                     'epochs': 20,
+                                     'len_max': 50,
+                                     'vocab_size': 20000, #这里随便填的，会根据代码里修改
+                                     'lr': 1e-3,
+                                     'l2': 1e-6,
+                                     'activate_classify': 'softmax',
+                                     'embedding_type': 'random', # 还可以填'random'、 'bert' or 'word2vec"
+                                     'is_training': True,
+                                     'model_path': path_model_fast_text_baiduqa_2019,
+                                     'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
+                                     'rnn_units': 256,  # large 650, small is 300
+                                     },
+                        'embedding':{ 'embedding_type': 'random',
+                                      'corpus_path': path_embedding_random_char,
+                                      'level_type': 'char',
+                                      'embed_size': 30,
+                                      'len_max': 50,
+                                      },
+                         }
+    graph = Graph(hyper_parameters)
+    ra_ed = graph.word_embedding
+    pt = PreprocessText()
+    x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train, ra_ed)
+    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid, ra_ed)
+    indexs = [ids for ids in range(len(y_train))]
+    random.shuffle(indexs)
+    x_train, y_train = x_train[indexs], y_train[indexs]
+
+    print(len(y_train))
+    # 只训练部分
+    # graph.fit(x_train[0:32000], y_train[0:32000], x_val[0:3200], y_val[0:3200])
+
+    graph.fit(x_train, y_train, x_val, y_val)
--- a/keras_textclassification/m06_TextDCNN/init.py
+++ b/keras_textclassification/m06_TextDCNN/init.py
@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/6/12 18:15
+# @author   :Mo
+# @function :