From 4c86e528d7ee1057a14d5234b6030fcb95895d61 Mon Sep 17 00:00:00 2001 From: yongzhuo <2714618994@qq.com> Date: Wed, 14 Aug 2019 22:30:32 +0800 Subject: [PATCH] multi-class and charCNN-zhang --- README.md | 8 +- keras_textclassification/conf/path_config.py | 5 + .../data/byte_multi_news/labels.csv | 1070 +++++++++++++++++ .../data/byte_multi_news/readme.md | 9 + .../data/byte_multi_news/train.csv | 170 +++ .../data/byte_multi_news/valid.csv | 98 ++ .../data_preprocess/text_preprocess.py | 180 +++ .../m03_CharCNN/graph_zhang.py | 53 +- .../m03_CharCNN/train_zhang.py | 100 ++ .../multi_class}/__init__.py | 2 +- test/multi_class/predict_multi.py | 84 ++ test/multi_class/train_multi.py | 90 ++ 12 files changed, 1866 insertions(+), 3 deletions(-) create mode 100644 keras_textclassification/data/byte_multi_news/labels.csv create mode 100644 keras_textclassification/data/byte_multi_news/readme.md create mode 100644 keras_textclassification/data/byte_multi_news/train.csv create mode 100644 keras_textclassification/data/byte_multi_news/valid.csv create mode 100644 keras_textclassification/m03_CharCNN/train_zhang.py rename {keras_textclassification/data/model/fast_text => test/multi_class}/__init__.py (71%) create mode 100644 test/multi_class/predict_multi.py create mode 100644 test/multi_class/train_multi.py diff --git a/README.md b/README.md index e5be383..c264b65 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ # run(test/sample实例) - bert,word2vec,random样例在test/目录下, 注意word2vec(char or word), random-word, bert(chinese_L-12_H-768_A-12)未全部加载,需要下载 + - multi_class/目录下以text-cnn为例进行多标签分类实例,转化为multi-onehot标签类别,分类则取一定阀值的类 - predict_bert_text_cnn.py - tet_char_bert_embedding.py - tet_char_random_embedding.py @@ -40,6 +41,10 @@ - baidu_qa_2019(百度qa问答语料,只取title作为分类样本,17个类,有一个是空'',已经压缩上传) - baike_qa_train.csv - baike_qa_valid.csv + -byte_multi_news(今日头条2018新闻标题多标签语料,1070个标签,fate233爬取, 地址为: [byte_multi_news](https://github.com/fate233/toutiao-multilevel-text-classfication-dataset)) + -labels.csv + -train.csv + -valid.csv - embeddings - chinese_L-12_H-768_A-12/(取谷歌预训练好点的模型,已经压缩上传) - term_char.txt(已经上传, 项目中已全, wiki字典, 还可以用新华字典什么的) @@ -57,7 +62,8 @@ # 模型与论文paper题与地址 * FastText: [Bag of Tricks for Efficient Text Classification](https://arxiv.org/abs/1607.01759) * TextCNN: [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) -* charCNN: [Character-Aware Neural Language Models](https://arxiv.org/abs/1508.06615) +* charCNN-kim: [Character-Aware Neural Language Models](https://arxiv.org/abs/1508.06615) +* charCNN-zhang: [Character-level Convolutional Networks for Text Classification](https://arxiv.org/pdf/1509.01626.pdf) * TextRNN: [Recurrent Neural Network for Text Classification with Multi-Task Learning](https://www.ijcai.org/Proceedings/16/Papers/408.pdf) * RCNN: [Recurrent Convolutional Neural Networks for Text Classification](http://www.nlpr.ia.ac.cn/cip/~liukang/liukangPageFile/Recurrent%20Convolutional%20Neural%20Networks%20for%20Text%20Classification.pdf) * DCNN: [A Convolutional Neural Network for Modelling Sentences](https://arxiv.org/abs/1404.2188) diff --git a/keras_textclassification/conf/path_config.py b/keras_textclassification/conf/path_config.py index 20c46ea..b8e8009 100644 --- a/keras_textclassification/conf/path_config.py +++ b/keras_textclassification/conf/path_config.py @@ -21,6 +21,11 @@ path_embedding_vector_word2vec_word = path_root + '/data/embeddings/w2v_model_me path_baidu_qa_2019_train = path_root + '/data/baidu_qa_2019/baike_qa_train.csv' path_baidu_qa_2019_valid = path_root + '/data/baidu_qa_2019/baike_qa_valid.csv' +# 今日头条新闻多标签分类 +path_byte_multi_news_train = path_root + '/data/byte_multi_news/train.csv' +path_byte_multi_news_valid = path_root + '/data/byte_multi_news/valid.csv' +path_byte_multi_news_label = path_root + '/data/byte_multi_news/labels.csv' + # fast_text config # 模型目录 path_model_dir = path_root + "/data/model/fast_text/" diff --git a/keras_textclassification/data/byte_multi_news/labels.csv b/keras_textclassification/data/byte_multi_news/labels.csv new file mode 100644 index 0000000..7268dec --- /dev/null +++ b/keras_textclassification/data/byte_multi_news/labels.csv @@ -0,0 +1,1070 @@ +agriculture/animal_husbandry +agriculture/countryside +agriculture/farmer +agriculture/farming +agriculture/fishery +agriculture/forestry +all +article +blog +buddhism +business +career +cc +comic +design +design/architecture +digital +digital/UAV +digital/appliances +digital/appliances/air_conditioner +digital/appliances/fridge +digital/appliances/other +digital/appliances/small_home_appliance +digital/appliances/television +digital/appliances/washer +digital/browser +digital/cellphone +digital/cellphone/huawei_phone +digital/cellphone/other +digital/computer +digital/computer/laptop +digital/computer/laptop/ThinkPad +digital/computer/laptop/other +digital/computer/other +digital/computer_peripheral/mechanical_keyboard +digital/computer_peripheral/printer +digital/coolplay +digital/coolplay/other +digital/coolplay/smart_home +digital/coolplay/smart_home/other +digital/coolplay/smart_home/projector +digital/coolplay/smart_home/smart_tv +digital/coolplay/virtual_reality +digital/coolplay/wearable_devices +digital/digital_evaluating +digital/digital_soft +digital/gaming_peripherals +digital/hardware +digital/headset +digital/input_method +digital/operating_system +digital/operating_system/Android +digital/operating_system/other +digital/other +digital/pad +digital/photography +digital/photography/camera +digital/photography/other +digital/robot +digital/router +emotion +emotion/affection +emotion/affection/homosexuality +emotion/affection/love +emotion/affection/other +emotion/jitang +emotion/low_grage_emotion +emotion/marriage +emotion/marriage/extramarital_affair +emotion/marriage/other +emotion/other +fashion/cosmetology/beautify +fashion/cosmetology/beautify/skin_care +fashion/cosmetology/body_care +fashion/cosmetology/make_up +fashion/cosmetology/manicure +fashion/luxury/watch +fashion_wedding +fashion_wedding/other +fashion_wedding/wedding +fitness +funny +funny/funny_article +funny/marvel +funny/other +gallery_all +gallery_story +garbage +general_positive +general_positive/blessing +general_positive/culture +general_positive/essay +general_positive/inspiration +general_positive/other +general_positive/pride +general_positive/selfless +government +graduate +history/chinese_history +history/chinese_history/modern_history_china +hk_news_china +immigration +impact +impact_toutiao +impact_wukong +impact_xigua +leisure_life/angling +lottery +lottery/other +lottery/sports_lottery +marvel +million_hero +movie +news +news/china_show +news/nausea +news/pandian +news/singles_day +news/temai +news_agriculture +news_agriculture/animal_husbandry +news_agriculture/countryside +news_agriculture/farmer +news_agriculture/farming +news_agriculture/fishery +news_agriculture/forestry +news_agriculture/other +news_agriculture/planting +news_astrology +news_astrology/constellation_test +news_astrology/horoscope +news_astrology/numerology +news_astrology/numerology/chinese_zodiac +news_astrology/numerology/divination +news_astrology/numerology/other +news_astrology/other +news_baby +news_baby/baby_growth +news_baby/baby_growth/baby_nurturing/baby_nursing +news_baby/baby_growth/baby_parenting +news_baby/baby_growth/baby_parenting/other +news_baby/baby_growth/baby_parenting/pre_education +news_baby/baby_growth/breast_feeding +news_baby/baby_growth/childrens_books +news_baby/baby_growth/kindergarten +news_baby/baby_growth/other +news_baby/baby_nursing +news_baby/baby_nurturing +news_baby/baby_nurturing/baby_nursing +news_baby/baby_nurturing/other +news_baby/other +news_baby/pregnancy +news_baby/pregnancy/childbirth +news_baby/pregnancy/other +news_baby/pregnancy/postpartum_care +news_baby/pregnancy/pre_pregnancy +news_baby/pregnancy/pregnancying +news_beauty +news_car +news_car/MPV +news_car/SUV +news_car/SUV/off_road_car +news_car/SUV/other +news_car/american_car +news_car/british_car +news_car/car_culture +news_car/car_dealers +news_car/car_design +news_car/car_evaluating +news_car/car_exhibition +news_car/car_guide +news_car/car_huabian +news_car/car_industry +news_car/car_maintenance +news_car/car_market +news_car/car_modification +news_car/car_new_arrival +news_car/car_news +news_car/car_pretty +news_car/car_rental +news_car/car_tech +news_car/car_tiche +news_car/car_travel +news_car/car_usage +news_car/car_usage/auto_parts +news_car/car_usage/auto_repair +news_car/car_usage/car_maintenance +news_car/car_usage/other +news_car/commercial_car +news_car/domestic_car +news_car/french_car +news_car/german_car +news_car/huohua_car_evaluating +news_car/huohua_car_guide +news_car/huohua_car_market +news_car/huohua_car_usage +news_car/intelligent_travel +news_car/japanese_car +news_car/ltalian_car +news_car/luxury_car +news_car/motorcycle +news_car/new_energy_car +news_car/new_energy_car/electric_car +news_car/new_energy_car/other +news_car/off_road_car +news_car/other +news_car/south_korean_car +news_car/sports_car +news_car/touring_car +news_car/unmanned +news_car/used_car +news_career +news_career/career_planning +news_career/other +news_collect +news_collect/artwork +news_collect/boccaro_teapot +news_collect/ceramic +news_collect/coins +news_collect/jade +news_collect/other +news_collect/plaything +news_collect/stamp +news_comic +news_comic/animation +news_comic/blood_cartoon +news_comic/china_made_cartoon +news_comic/comicbook_exhibition +news_comic/cosplay +news_comic/japan_cartoon +news_comic/japan_cartoon/other +news_comic/japan_cartoon/seiyuu +news_comic/manhua +news_comic/manhua/horror_comics +news_comic/manhua/other +news_comic/marvel_comics +news_comic/other +news_comic/west_cartoon +news_culture +news_culture/architecture +news_culture/art +news_culture/art/acrobatics +news_culture/art/ballet +news_culture/art/behavior_art +news_culture/art/calligraphy +news_culture/art/ceramic_art +news_culture/art/cross_talk +news_culture/art/folk_custom +news_culture/art/handicraft +news_culture/art/installation_art +news_culture/art/museum +news_culture/art/oil_painting +news_culture/art/other +news_culture/art/sculpture +news_culture/art/street_art +news_culture/art/street_art/graffiti +news_culture/art/street_art/other +news_culture/art/traditional_opera +news_culture/historical_relic +news_culture/japanese_culture +news_culture/other +news_culture/reading +news_culture/reading/literature +news_culture/reading/literature/ancient_poetry +news_culture/reading/literature/other +news_culture/reading/literature/science_fiction +news_culture/reading/literature/shuihu_story +news_culture/reading/literature/stone_story +news_culture/reading/literature/wuxia +news_culture/reading/net_literature +news_culture/reading/other +news_culture/reading/poetry +news_culture/traditional_chinese +news_culture/traditional_chinese/confucianism +news_culture/traditional_chinese/other +news_design +news_design/ad_design +news_design/graphic_design +news_design/industrial_design +news_design/other +news_design/web_design +news_discovery +news_edu +news_edu/art_education +news_edu/art_education/belly_dance +news_edu/art_education/dance +news_edu/art_education/guitar +news_edu/art_education/hip_hop +news_edu/art_education/musical_instrument +news_edu/art_education/national_instrument +news_edu/art_education/other +news_edu/art_education/painting +news_edu/art_education/piano +news_edu/art_education/social_dancing +news_edu/art_education/square_dance +news_edu/civil_servant_test +news_edu/edu_foreign_lang +news_edu/edu_upgrade +news_edu/edu_upgrade/adult_education +news_edu/edu_upgrade/chinese_language +news_edu/edu_upgrade/college +news_edu/edu_upgrade/college_entrance_examination +news_edu/edu_upgrade/english_language +news_edu/edu_upgrade/high_school_entrance_examination +news_edu/edu_upgrade/other +news_edu/edu_upgrade/postgraduate_examination +news_edu/edu_upgrade/qualification_test +news_edu/ethical +news_edu/family_education +news_edu/online_education +news_edu/other +news_edu/philosophy +news_edu/study_abroad +news_entertainme +news_entertainment +news_entertainment/cross_talk +news_entertainment/drama +news_entertainment/film_tv +news_entertainment/film_tv/movie +news_entertainment/film_tv/movie/action_film +news_entertainment/film_tv/movie/chinese_movie +news_entertainment/film_tv/movie/comedy_movie +news_entertainment/film_tv/movie/documentary +news_entertainment/film_tv/movie/euro_and_us_movie +news_entertainment/film_tv/movie/film_festival +news_entertainment/film_tv/movie/horror_movie +news_entertainment/film_tv/movie/japanese_movie +news_entertainment/film_tv/movie/korea_movie +news_entertainment/film_tv/movie/micro_film +news_entertainment/film_tv/movie/micro_movie +news_entertainment/film_tv/movie/other +news_entertainment/film_tv/movie/romance_film +news_entertainment/film_tv/other +news_entertainment/film_tv/tv_series +news_entertainment/film_tv/tv_series/british_tv_seriess +news_entertainment/film_tv/tv_series/hk_tv_series +news_entertainment/film_tv/tv_series/japanese_tv_series +news_entertainment/film_tv/tv_series/korea_tv_series +news_entertainment/film_tv/tv_series/korean_tv +news_entertainment/film_tv/tv_series/mainland_tv +news_entertainment/film_tv/tv_series/mainland_tv_series +news_entertainment/film_tv/tv_series/other +news_entertainment/film_tv/tv_series/taiwan_tv_series +news_entertainment/film_tv/tv_series/us_tv_series +news_entertainment/gossip +news_entertainment/hk_taiwan_entertainment +news_entertainment/japan_korea_entertainment +news_entertainment/japan_korea_entertainment/japanese_entertainment +news_entertainment/japan_korea_entertainment/korea_entertainment +news_entertainment/japan_korea_entertainment/other +news_entertainment/music +news_entertainment/music/chinese_music +news_entertainment/music/concert +news_entertainment/music/folk_music +news_entertainment/music/japanese_music +news_entertainment/music/jazz +news_entertainment/music/korea_music +news_entertainment/music/music_festival +news_entertainment/music/other +news_entertainment/music/pop_music +news_entertainment/music/rock_and_roll +news_entertainment/opera +news_entertainment/other +news_entertainment/quyi +news_entertainment/reality_television +news_entertainment/sketch_comedy +news_entertainment/stage_play +news_entertainment/star_children +news_entertainment/variety +news_entertainment/variety/euro_and_us_variety +news_entertainment/variety/hk_and_taiwan_variety +news_entertainment/variety/japanese_variety +news_entertainment/variety/korea_variety +news_entertainment/variety/other +news_entertainment/variety/reality_television +news_entertainment/vulgar +news_entertainment/west_entertainment +news_entertainment/west_entertainment/hollywood +news_entertainment/west_entertainment/other +news_essay +news_fashion +news_fashion/cosmetology +news_fashion/cosmetology/beautify +news_fashion/cosmetology/beautify/other +news_fashion/cosmetology/beautify/skin_care +news_fashion/cosmetology/body_care +news_fashion/cosmetology/body_care/other +news_fashion/cosmetology/body_care/perfume +news_fashion/cosmetology/hairdressing +news_fashion/cosmetology/make_up +news_fashion/cosmetology/make_up/cosmetics +news_fashion/cosmetology/make_up/other +news_fashion/cosmetology/manicure +news_fashion/dressing +news_fashion/dressing/jewelry +news_fashion/dressing/other +news_fashion/dressing/shoes +news_fashion/fashion_man +news_fashion/fashion_man/mens_clothing +news_fashion/fashion_man/other +news_fashion/luxury +news_fashion/luxury/diamond +news_fashion/luxury/other +news_fashion/luxury/watch +news_fashion/other +news_finance +news_finance/accounting +news_finance/business +news_finance/business/company +news_finance/business_management +news_finance/business_management/accounting +news_finance/business_management/marketing +news_finance/business_management/other +news_finance/entrepreneurship +news_finance/entrepreneurship/other +news_finance/entrepreneurship/venture_capital +news_finance/financing +news_finance/financing/bank +news_finance/financing/bank/credit_card +news_finance/financing/bank/other +news_finance/financing/crowd_funding +news_finance/financing/digital_currency +news_finance/financing/insurance +news_finance/financing/other +news_finance/hk_stock +news_finance/industrial_economy/bioenergy +news_finance/industrial_economy/coa +news_finance/industrial_economy/energy +news_finance/industrial_economy/metallurgical +news_finance/investment +news_finance/investment/finance_management +news_finance/investment/foreign_exchange +news_finance/investment/fund +news_finance/investment/futures +news_finance/investment/insurance +news_finance/investment/investment_banking/bank +news_finance/investment/investment_financing/bank +news_finance/investment/investment_financing/crowd_funding +news_finance/investment/noble_metals +news_finance/investment/noble_metals/other +news_finance/investment/noble_metals/silver +news_finance/investment/other +news_finance/investment/stock +news_finance/investment/stock/IPO +news_finance/investment/stock/architectural_decoration_industry +news_finance/investment/stock/banking_sector +news_finance/investment/stock/biological_medicine_industry +news_finance/investment/stock/blue_chips +news_finance/investment/stock/building_materials_industry +news_finance/investment/stock/chemical_industry +news_finance/investment/stock/commercial_trade_industry +news_finance/investment/stock/comprehensive_industry +news_finance/investment/stock/computer_industry +news_finance/investment/stock/ecological_economic +news_finance/investment/stock/electrical_equipment_industry +news_finance/investment/stock/electronic_industry +news_finance/investment/stock/food_beverage_industry +news_finance/investment/stock/hea_industry +news_finance/investment/stock/iron_steel_industry +news_finance/investment/stock/leisure_service_industry +news_finance/investment/stock/light_industry +news_finance/investment/stock/machine_equipment_industry +news_finance/investment/stock/media_industry +news_finance/investment/stock/mining_industry +news_finance/investment/stock/motor_dom +news_finance/investment/stock/national_defense_industry +news_finance/investment/stock/nonebank_financial_sector +news_finance/investment/stock/nonferrous_industry +news_finance/investment/stock/other +news_finance/investment/stock/public_utility +news_finance/investment/stock/real_estate_industry +news_finance/investment/stock/stock_announcement +news_finance/investment/stock/stock_closing_analysis +news_finance/investment/stock/stock_guide +news_finance/investment/stock/stock_recommend +news_finance/investment/stock/telecommunication_industry +news_finance/investment/stock/textile_garment_industry +news_finance/investment/stock/transportation_industry +news_finance/investment/stock/usa_stock +news_finance/macro_economic +news_finance/macro_economic/macro_economic_china +news_finance/macro_economic/macro_economic_world +news_finance/macro_economic/other +news_finance/marketing +news_finance/oil_price +news_finance/other +news_finance/stock_announcement +news_finance/stock_closing_analysis +news_finance/stock_guide +news_food +news_food/catering_industry +news_food/coffee +news_food/cooking_skill +news_food/dessert +news_food/food_culture +news_food/fruit +news_food/menu +news_food/menu/chinese_food +news_food/menu/japanese_cuisine +news_food/menu/other +news_food/other +news_food/seafood +news_food/snack +news_food/tea +news_food/vegetables +news_food/vegetarianism +news_food/western_food +news_food/wine_drinking +news_food/wine_drinking/grape_wine +news_food/wine_drinking/other +news_food/wine_drinking/white_wine +news_game +news_game/action_game +news_game/adventure_game +news_game/chess_game +news_game/e-game +news_game/e-game/other +news_game/e-game/somatosensory_game_machine +news_game/e-sport +news_game/e-sport/lol +news_game/e-sport/other +news_game/game_industry +news_game/mobile_game +news_game/mobile_game/im_mt +news_game/mobile_game/other +news_game/online_game +news_game/online_game/dnf +news_game/online_game/fantasy_westward_journey +news_game/online_game/hearthstone +news_game/online_game/other +news_game/online_game/wow +news_game/other +news_game/pc_game +news_game/role_playing_game +news_game/sandbox_game +news_game/shooting_game +news_game/shooting_game/cross_fire +news_game/shooting_game/other +news_game/sports_game +news_game/strategy_game +news_game/table_game +news_geomantic +news_health +news_health/angiocardiopathy +news_health/body_building +news_health/body_building/other +news_health/body_building/yoga +news_health/cancer +news_health/cancer/breast_cancer +news_health/cancer/liver_cancer +news_health/cancer/other +news_health/cancer/stomach_cancer +news_health/cervical_spondylosis +news_health/daily_health +news_health/disease +news_health/flu +news_health/gastrointestinal_diseases +news_health/health_care_food +news_health/health_soft +news_health/heredity +news_health/infectious_disease +news_health/medical_news +news_health/medical_news/doctor_patient_relationship +news_health/medical_news/healthcare_reform +news_health/medical_news/other +news_health/medicine +news_health/medicine/other +news_health/men_health +news_health/mental_health +news_health/mental_health/anxiety_disorders +news_health/mental_health/autism +news_health/mental_health/melancholia +news_health/mental_health/mental_disease +news_health/mental_health/other +news_health/nursing +news_health/obstetric +news_health/oral_care +news_health/other +news_health/paediatrics +news_health/plastic_surgery +news_health/plastic_surgery/cup_up +news_health/plastic_surgery/other +news_health/prostate +news_health/regimen +news_health/regimen/diet_therapy +news_health/regimen/healthy_eating +news_health/regimen/nutriology +news_health/regimen/other +news_health/regimen/sleep +news_health/regimen/subhealth +news_health/respiratory_disease +news_health/sexual_health +news_health/sexual_health/kidney_care +news_health/sexual_health/men_health +news_health/sexual_health/other +news_health/sexual_health/sex_skill +news_health/sexual_health/women_health +news_health/sexual_health/women_health/gynecology +news_health/sexual_health/women_health/other +news_health/skin_disease +news_health/slimming +news_health/traditional_chinese_medicine +news_health/traditional_chinese_medicine/acupoint +news_health/traditional_chinese_medicine/chinese_herbology +news_health/traditional_chinese_medicine/other +news_health/women_health +news_health/women_helath +news_history +news_history/archaeology +news_history/chemistry +news_history/chinese_history +news_history/chinese_history/ancient_chinese_history +news_history/chinese_history/ancient_chinese_history/other +news_history/chinese_history/ancient_chinese_history/spring_autumn_period +news_history/chinese_history/ancient_chinese_history/tang_dynasty +news_history/chinese_history/modern_history_china +news_history/chinese_history/other +news_history/mathematics +news_history/military_history/WWII +news_history/military_history/anti_japanese_war +news_history/other +news_history/physics +news_history/world_history +news_history/world_history/Asian_history +news_history/world_history/other +news_history/world_history/tomb +news_home +news_home/DIY +news_home/decoration +news_home/decoration/furniture +news_home/decoration/home_design +news_home/decoration/kitchen_appliances +news_home/decoration/mini_decoration +news_home/decoration/other +news_home/floriculture +news_home/home_decoration +news_home/other +news_home/succulent +news_hot +news_house +news_house/house_industry +news_house/house_market +news_house/house_market/house_purchase +news_house/house_renting +news_house/house_soft +news_house/other +news_lady +news_law +news_lieqi +news_life +news_local +news_media +news_military +news_military/aerospace +news_military/air_force +news_military/army +news_military/military_china +news_military/military_history +news_military/military_history/other +news_military/military_world +news_military/navy +news_military/navy/other +news_military/navy/warship +news_military/other +news_military/weaponry +news_military/weaponry/aircraft_carrier +news_military/weaponry/fight_plane +news_military/weaponry/missile +news_military/weaponry/other +news_nature +news_novel +news_novel/kehuanlingyi +news_novel/kehuanlingyi/kehuan +news_novel/kehuanlingyi/kehuan/moshiweiji +news_novel/kehuanlingyi/kehuan/other +news_novel/kehuanlingyi/kehuan/xingjiwenming +news_novel/kehuanlingyi/xuanyilingyi/lingyishenguai +news_novel/kehuanlingyi/xuanyilingyi/tanxianmaoxian +news_novel/kehuanlingyi/xuanyilingyi/zhentantuili +news_novel/lishijunshi +news_novel/lishijunshi/junshixiaoshuo +news_novel/lishijunshi/junshixiaoshuo/junshizhanzheng +news_novel/lishijunshi/junshixiaoshuo/other +news_novel/lishijunshi/lishixiaoshuo +news_novel/lishijunshi/lishixiaoshuo/other +news_novel/lishijunshi/lishixiaoshuo/waiguolishi +news_novel/lishijunshi/other +news_novel/mingzhujingdian +news_novel/mingzhujingdian/other +news_novel/other +news_novel/qihuanxuanhuan/qihuan +news_novel/qihuanxuanhuan/qihuan/mofahuanqing +news_novel/qihuanxuanhuan/qihuan/other +news_novel/qihuanxuanhuan/xuanhuan +news_novel/qihuanxuanhuan/xuanhuan/dongfangxuanhuan +news_novel/qihuanxuanhuan/xuanhuan/yishidalu +news_novel/tongrendanmei/danmeixiaoshuo +news_novel/tongrendanmei/danmeixiaoshuo/other +news_novel/tongrendanmei/tongrenxiaoshuo/yingshitongren +news_novel/wuxiaxianxia/wuxiaxiaoshuo +news_novel/wuxiaxianxia/wuxiaxiaoshuo/other +news_novel/yanqingxiaoshuo/langmanqingchun/qingchunxiaoyuan +news_novel/yanqingxiaoshuo/xiandaiyanqing/yulemingxing +news_novel/yingshixiaoshuo +news_novel/yingshixiaoshuo/other +news_novel/youxijingji/tiyujingji +news_novel/youxijingji/tiyujingji/other +news_novel/youxijingji/youxixiaoshuo +news_novel/youxijingji/youxixiaoshuo/xuniwangyou +news_others +news_pet +news_pet/birds +news_pet/cats +news_pet/dogs +news_pet/fishes +news_pet/other +news_pet/rabbit +news_photography +news_picture +news_politics +news_politics/civil_servant +news_politics/family_planning +news_politics/high_court +news_politics/other +news_politics/politics_law +news_politics/politics_supervision +news_politics/politics_supervision/anti_corruption +news_politics/politics_supervision/other +news_politics/taiwan_politics +news_politics_general +news_positive_values +news_psychology +news_regimen +news_society +news_society/NGO +news_society/anecdotes +news_society/civil +news_society/civil/conserve_energy_emission +news_society/civil/environmental_protection +news_society/civil/environmental_protection/air_pollution +news_society/civil/environmental_protection/environmental_pollution +news_society/civil/environmental_protection/other +news_society/civil/environmental_protection/water_pollution +news_society/civil/other +news_society/civil/philanthropy +news_society/civil/social_insurance +news_society/civil/urban_medical_insurance +news_society/employment +news_society/food_security +news_society/negative_energy_society +news_society/news_law +news_society/news_law/gangdom +news_society/news_law/narcotics +news_society/news_law/other +news_society/news_law/pyramid_schemes +news_society/news_law/statute +news_society/other +news_society/people_search +news_society/positive +news_society/rumor +news_society/weather +news_society/weather/earthquake +news_society/weather/meteorologic_disasters +news_society/weather/meteorological +news_society/weather/other +news_society/weather/typhoon +news_sports +news_sports/america_football +news_sports/america_football/NFL +news_sports/america_football/other +news_sports/angling +news_sports/badminton +news_sports/baseball +news_sports/baseball/other +news_sports/baseball/softball +news_sports/basketball +news_sports/basketball/CBA +news_sports/basketball/NBA +news_sports/basketball/basketball_china +news_sports/basketball/basketball_china/men_basketball_china +news_sports/basketball/basketball_china/other +news_sports/basketball/basketball_china/women_basketball_china +news_sports/basketball/other +news_sports/bicycle_exercise +news_sports/billiard +news_sports/billiard/other +news_sports/billiard/snooker +news_sports/car_racing +news_sports/car_racing/F1_racing +news_sports/car_racing/car_rally +news_sports/car_racing/other +news_sports/chess_card +news_sports/chess_card/chinese_chess +news_sports/chess_card/international_chess +news_sports/chess_card/other +news_sports/chess_card/weiqi +news_sports/diving +news_sports/fight +news_sports/fight/boxing +news_sports/fight/free_combat +news_sports/fight/integrated_combat +news_sports/fight/other +news_sports/fight/taekwondo +news_sports/fight/wrestling +news_sports/football +news_sports/football/football_china +news_sports/football/football_china/CSL +news_sports/football/football_china/china_league +news_sports/football/football_china/men_football_china +news_sports/football/football_china/other +news_sports/football/football_china/women_football_china +news_sports/football/football_world +news_sports/football/football_world/euro_cup_gallery +news_sports/football/football_world/football_england +news_sports/football/football_world/football_eu_champion +news_sports/football/football_world/football_france +news_sports/football/football_world/football_germany +news_sports/football/football_world/football_italy +news_sports/football/football_world/football_spain +news_sports/football/football_world/other +news_sports/football/other +news_sports/golf +news_sports/gymnastics +news_sports/hiking +news_sports/hockey +news_sports/horse_racing +news_sports/martial_arts +news_sports/mountaineering +news_sports/olympic_games +news_sports/olympic_games/olympic_gallery +news_sports/olympic_games/other +news_sports/other +news_sports/outdoor_sports +news_sports/ping_pong +news_sports/racing +news_sports/rowing +news_sports/running +news_sports/running/marathon +news_sports/running/other +news_sports/sailing_sport +news_sports/shooting +news_sports/skiing +news_sports/sport_gossip +news_sports/swimming +news_sports/tennis +news_sports/volleyball +news_sports/weightlifting +news_sports/winter_olympic +news_sports/winter_sport +news_sports/winter_sport/curling +news_sports/winter_sport/other +news_sports/winter_sport/skating +news_sports/x-sports/skateboard +news_sports/x_sports +news_sports/x_sports/other +news_sports/x_sports/skateboard +news_story +news_story/ghost_story +news_story/other +news_tech +news_tech/3D_printing +news_tech/artificial_intelligence +news_tech/communication +news_tech/communication/other +news_tech/communication/tele_carrier +news_tech/information_security +news_tech/internet +news_tech/internet/big_data +news_tech/internet/e-business +news_tech/internet/e-business/O2O +news_tech/internet/e-business/other +news_tech/internet/e_business/O2O +news_tech/internet/internet_finance +news_tech/internet/internet_finance/other +news_tech/internet/internet_finance/p2p_finance +news_tech/internet/internet_of_things +news_tech/internet/mobile_internet +news_tech/internet/mobile_internet/mobile_ad +news_tech/internet/mobile_internet/mobile_payment +news_tech/internet/mobile_internet/other +news_tech/internet/network_security +news_tech/internet/other +news_tech/internet_cellphone +news_tech/maker +news_tech/other +news_tech/software +news_traditional_culture +news_travel +news_travel/honeymoon_trip +news_travel/other +news_travel/self_driving_tour +news_travel/travel_guides +news_travel/travel_industry +news_travel/travel_information +news_travel/travel_notes +news_world +news_world/diplomacy +news_world/diplomacy/middle_east +news_world/diplomacy/other +news_world/diplomacy/sino_american_relations +news_world/diplomacy/sino_japanese_relations +news_world/diplomacy/sino_russian_relations +news_world/diplomacy/south_china_sea +news_world/diplomacy/south_china_sea/nansha_islands +news_world/diplomacy/south_china_sea/other +news_world/diplomacy/south_china_sea/xisha_islands +news_world/other +news_world/overseas_chinese +news_world/world_society +news_world/world_society/illegal_immigrant +news_world/world_society/other +nineteenth +npc_cppcc +other_niche +outdoor_live +photography +positive +positive_selfless +public_welfare +qingyunjihua +rumor +science +science_all +science_all/alien_life +science_all/animal +science_all/animal/animal +science_all/animal/bear +science_all/animal/dinosaur +science_all/animal/elephant +science_all/animal/horse +science_all/animal/insects +science_all/animal/lion +science_all/animal/other +science_all/animal/snake +science_all/animal/wolf +science_all/astronomy +science_all/astronomy/alien_life +science_all/astronomy/mars_exploration +science_all/astronomy/other +science_all/atmospheric_science +science_all/bio_technology +science_all/biomedical_science +science_all/chemistry +science_all/dinosaur +science_all/discovery +science_all/geography +science_all/geological +science_all/mars_exploration +science_all/material_science +science_all/mathematics +science_all/neuro_science +science_all/nuclear_technology +science_all/oceanography +science_all/other +science_all/physics +science_all/plant +science_all/remote_sensing +science_all/science +science_all/transgene +society/negtive_energy_society +spring_festival +technique +technique/DevOps +technique/NoSQL +technique/algorithm +technique/cloud_computing +technique/data_mining +technique/database +technique/distributed_computation +technique/machine_learning +technique/natural_language_processing +technique/open_source_software +technique/other +technique/programming_language +technique/search_engine +technique/supercomputer +technique/text_analysis +technique/text_editor +technique/virtual_machine +temai_all +tip +two_sessions +video_animal +video_astrology +video_auto +video_auto/video_auto_others +video_auto/video_auto_suv +video_baby +video_beauty +video_car +video_car/car_accident +video_car2 +video_car_other +video_child +video_child/video_child_others +video_child/video_child_rearing +video_chinese_show +video_constellatory +video_culture +video_curiosity +video_design +video_digital +video_diy_manifacture +video_domestic +video_edu +video_edu/video_edu_others +video_emotion +video_ent +video_ent/video_ent_info +video_ent/video_ent_info/video_ent_info_discuss +video_ent/video_ent_info/video_ent_info_others +video_ent/video_music +video_ent/video_music/video_music_live +video_ent/video_music/video_music_niuren +video_ent/video_zongyi +video_ent/video_zongyi/video_zongyi_others +video_entertainm +video_entertainment +video_fashion +video_fashion/other +video_fashion/video_fashion_movie +video_finance +video_finance/other +video_finance/video_finance_program +video_fishing +video_food +video_food/video_food_others +video_funny +video_funny/video_funny_others +video_funny/video_funny_waichang +video_funny_dub +video_funny_others +video_game +video_gaming +video_health +video_health/video_health_goodlife +video_history +video_life +video_life_tips +video_military +video_motherbaby +video_movie +video_movie_others +video_music +video_others +video_others/video_others_others +video_painting +video_pet +video_politics +video_scientific +video_social +video_social/video_social_info +video_social/video_social_news +video_social/video_social_others +video_social/video_social_report +video_social_others +video_society +video_sports +video_sports/video_football +video_sports/video_nba +video_squaredance +video_tech +video_tech/video_internet +video_travel +video_travel/other +video_variety +video_vehicles +video_voice +video_world +worldcup diff --git a/keras_textclassification/data/byte_multi_news/readme.md b/keras_textclassification/data/byte_multi_news/readme.md new file mode 100644 index 0000000..ba1b644 --- /dev/null +++ b/keras_textclassification/data/byte_multi_news/readme.md @@ -0,0 +1,9 @@ +# + *Դ: [https://github.com/fate233/toutiao-multilevel-text-classfication-dataset](https://github.com/fate233/toutiao-multilevel-text-classfication-dataset) + *ǩ: 1070 + *Ŀ:ԭʼÿļԼ605ļ + *ָ: |,| + *:news_world/other,news_world|,|ͳգǰũ + *ݸʽ:labelquestion + *ԭʼ:1000866069|,|tip,news|,|ѶPPTƱ!ʮнõóֵPPT|,|,,ppt,powerpoint,õƬ,ʾĸ,΢,б|,| + *ԭʼݸʽ:ID룬ַ⣩Źؼʣ \ No newline at end of file diff --git a/keras_textclassification/data/byte_multi_news/train.csv b/keras_textclassification/data/byte_multi_news/train.csv new file mode 100644 index 0000000..9cf094c --- /dev/null +++ b/keras_textclassification/data/byte_multi_news/train.csv @@ -0,0 +1,170 @@ +label|,|ques +news_sports/basketball/NBA,news_sports/other,news_sports|,|那些被詹皇打散重建的球队 +news_tech/internet/e-business/O2O,news_finance/entrepreneurship/other,news_finance/entrepreneurship,news_finance|,|“三个成倍”是衡量O2O创业项目能否成功的关键 +news_sports,news_entertainment|,|快看!梅西演电影了 网友:拉玛西亚影视学院最正宗 +news_society/civil/social_insurance,news_society/other,news_agriculture/farmer,news_society,news_agriculture|,|农民50岁还干活,只为买“退休”,一次缴满9万元划算不划算? +news_home/other,news_home|,|只需做好这四点,就能让你养的天竺葵全年花开不断! +news_travel/other,news_home/floriculture,news_travel|,|长沙,红梅盛开含笑迎接新年 +news|,|赋得真情必动人,那些轻触心尖儿的情诗 +news_society/news_law/statute,news_politics/other,news_society/news_law,news_society,news_politics|,|「第212期」重拳出击治“老赖”丨不开玩笑,拿下13个亿! +news_finance/investment/other,news_tech/other,news_politics/other,news_finance/investment,news_finance,news_tech,news_politics|,|中国品牌创新发展工程推荐企业 +news_food/other,news_food|,|新人报到 | 15分钟“一人食”营养早餐,怎么做? +news_fashion/dressing,news_entertainment/vulgar,news_fashion|,|泫雅新MV将BRA外穿得太性感差点被禁 +news_tech/other,news_tech|,|奥迪、乐视、丰田要让汽车VR从虚拟到现实,还有几道坎? +news_sports/tennis,news_sports|,|与费纳德同处一个时代的不幸,他最有话说 +emotion/marriage/other,emotion/affection/other,emotion/marriage,emotion/affection,emotion|,|小姑子帮我带大儿子,二十年后她找我借3万,我给她转账30万 +news_entertainment/hk_taiwan_entertainment,news_entertainment|,|因为一句爱国话在台湾疯狂掉粉 为什么还有人去骂他不爱国? +news_health/other,news_health|,|业内动态 | ERAS理念下的术后镇痛研讨会纪实总结 +news|,|吸烟有害健康,戒不了,怎么办?让茶帮你 +news_entertainment/west_entertainment/other,news_entertainment/west_entertainment,news_entertainment|,|《哈利波特》“斯内普教授”艾伦·里克曼去世 +news_society/news_law/other,news_house/house_renting,news_society/news_law,news_society|,|“119”警示丨三小场所、出租屋火灾案例警示 +digital/cellphone/huawei_phone,digital/operating_system/Android,news_tech/software,digital/operating_system,digital/hardware,digital/cellphone,digital,news_tech|,|国内首发安卓7.0 华为Mate 9之EMUI 5.0体验 +news_sports/olympic_games,news_sports/ping_pong,news_sports|,|她是国乒第一个在奥运会让球的选手,让球后选择退役并远嫁韩国 +news_entertainment/other,news_entertainment|,|49岁翁虹全家近照,富豪老公为生儿子改名,女儿可爱文静像美妈 +digital/cellphone/other,digital/operating_system/Android,news/pandian,digital/cellphone,digital|,|手机也疯狂!盘点那些打破世界纪录的手机 +emotion/marriage/other,emotion/affection/other,emotion/marriage,emotion/affection,emotion/jitang,emotion/low_grage_emotion,emotion|,|当妻子重要的不是有漂亮脸蛋,重要的是你必须有智慧 +news_home/decoration/furniture,news_home/decoration,news_home|,|简单装修怎么搭配实木家具?这几个案例请收下 +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_history/chinese_history,news_history|,|短暂存在的三川郡,影响中国两千年的三件事 +news_society/other,news_society|,|注意啦!高速上遇到这事,千万别停车!别开车门!别开窗! +digital/coolplay/other,digital/cellphone/other,digital/computer_peripheral/mechanical_keyboard,digital/coolplay,digital/cellphone,digital|,|装逼族的“大宝剑”之花式秀键盘 +news_society/civil/other,news_society/civil,government,news_society|,|成安交警大队2016年中秋小长假期间两公布一提示 +news_travel/self_driving_tour,news_travel|,|我国唯一融山海湖为一体的秘境 +news|,|谁家少得了书桌?不过这么美的你肯定没见过 +news|,|独家|陈灼昊案判决书全文 +news_military/weaponry/other,news_edu/art_education/painting,news_military/weaponry,news_military|,|致歉···弄错国产54手枪和仿制原版的苏联TT-33手枪 +news_entertainment,news|,|考试作弊是门艺术~但是谁用过这些奇葩方法? +news_collect/plaything,news_collect|,|蜜蜡附有色膜是什么鬼? +news_pet/other,news_pet|,|老爷爷救了两受伤狐狸,可当它们伤好了准备放走时,却发生了这事 +digital/cellphone/other,digital/hardware,digital/cellphone,digital|,|打造极致Hi-Fi影视手机 国产品牌谁也不敢与vivo为敌 +news|,|父母的位置摆不正,子孙就会很危险! +emotion/affection/other,emotion/marriage/extramarital_affair,emotion/marriage,emotion/affection,emotion/low_grage_emotion,emotion|,|激情过后——请给你的婚外情披上合法的外衣 +news_society/other,news_society|,|鸡蛋如何装进油桶?今天全面解析 +news_sports/basketball/NBA,news_sports/basketball,news_sports|,|他是03年NBA选秀三杰最大苦主,防守如铜墙铁壁一般! +government,news_society/civil/other,news_society/civil,news_society|,|[余票信息]快关注、早出手,春运之路更顺心 +digital/coolplay/wearable_devices,digital/other,news_tech/internet_cellphone,news_tech,digital|,|小米生态链近期发狂!?1MORE、90分近日又有新品问世! +news_culture/other,news_food/tea,news_culture|,|品精典布朗山金瓜,鉴一片茶叶里多少春秋! +news_tech,news_food/wine_drinking/other,news_finance/entrepreneurship/other,news_food/wine_drinking,news_finance/entrepreneurship,news|,|程序员创业首先学会喝酒! +news_fashion/cosmetology/body_care/other,news_fashion/cosmetology/body_care,news_fashion/other,news/pandian,news_fashion|,|经常按摩疏通淋巴,还可以瘦脸!明星们都这么做! +news|,|数学建模思想|从现实对象到数学模型! +news_edu/edu_upgrade/english_language,news_edu/other,news_edu|,|1706四六级全科备考计划 +news_culture/art/calligraphy,news_edu/art_education/painting,news_collect/other,news_culture/art,news_culture,news_collect|,|3.3一元拍卖会︱名家书画专场 +digital/cellphone/other,digital/photography/camera,digital/cellphone,digital|,|这也能叫黑科技?双镜头cool1 dual笑而不语 +news_car/other,news_car|,|4S店猫腻之选装! +news_history|,|毛主席曾经特赦过七次战犯,为何没有特赦汪精卫的夫人陈璧君? +news_car/sports_car,news|,|这款车只卖给中国人,太霸道了! +digital/coolplay/virtual_reality,digital/hardware,news_tech/artificial_intelligence,technique/natural_language_processing,news_tech|,|自然裸手ARVR体验 uSens凌感正式发布Fingo +news_entertainment/gossip,news_entertainment|,|近期娱乐圈的四大重磅新闻 一条刷新三观两条让人心痛不愿相信! +news_health/regimen/other,news_health/medical_news/other,news_health/cancer/other,news_health/cancer,news_health/regimen,news_health/medical_news,news_health|,|精品原创︱大肠癌肝转移患者“能活多久”? +news|,|最爱二次元美女丨少女前线美图 +news_fashion/dressing/other,news_fashion/dressing,news_fashion|,|2016最火的吊带衫搭配方案,全在这了! +news_finance/investment/stock/other,news_finance/investment/stock,news_agriculture/planting,news_finance/investment,news_agriculture,news_finance|,|尿素 暴涨背后的4个理由 及后市预测! +lottery/other,lottery|,|大乐透16139期关注凤尾28、29 不要错过38亿奖池! +news_finance/investment/stock/other,news_finance/investment/stock,news_finance/investment,news_finance|,|宇辉战舰盘前点睛20160711 +news_baby/baby_growth/baby_parenting/other,news_food/menu/other,news_baby/baby_growth/baby_parenting,news_baby/baby_nurturing/baby_nursing,news_baby/baby_nurturing,news_food/menu,news_baby/baby_growth,news_food,news_baby,news_psychology|,|半大孩子喜欢的几个家常菜 +news_photography|,|归零归零 丨 更新预告 采集神秘白色体液的小护士 5月20日 +news_food/dessert,news_food|,|美得如此丧心病狂的甜点,是想让我吃还是不吃呢~ +news_fashion/dressing/other,funny/marvel,news_fashion/dressing,news_fashion,funny|,|被路边小哥拉进理发店,进去之后竟然是这样的! +news_car/car_usage/other,news_car/huohua_car_usage,news_car/car_usage,news_car|,|开车三年半,有一样东西该换了,可许多温州人却不知道! +news_finance/entrepreneurship/other,news_tech/maker,news_finance/entrepreneurship,news_finance,news_tech|,|创客100第十五期开放日以前,他用报道把副部级官员送进监狱,今天,他用内容“勾引”读者 +news_tech/other,news_tech|,|熟记这11个最常用PS快捷键 早日成为专业设计师 +news_sports/football/football_world/football_england,news_sports/football/football_world,news_sports/football,news_sports|,|足球风云汇:收官献礼,这场大球不可错过 +news_astrology/numerology/chinese_zodiac,news_culture/other,news_culture|,|第一轮生肖贺岁纪念币价值连城,你见过哪个? +news_sports/football/football_world/other,news_sports/football/football_world,news_sports/football,news_sports|,|足坛15大终身无缘世界杯的巨星,谁最遗憾 +news_society/other,news_politics/other,news_military/other,news_society,news_politics,news_military|,|淄博中心城区公安武警联勤武装巡逻启动 +news_food/menu/other,news_food/menu,news_food|,|凉拌猪心:几道小动作“除腥增鲜”做出优质下酒菜! +science_all/animal/snake,science_all/other,science_all|,|蛇来啦!简单讲下曼巴属的区分! +news_baby/pregnancy/pregnancying,news_baby/other,news_baby|,|中国唐氏儿成了美国网红,唐筛不合格,你愿意把胎儿生下来吗? +news_tech/internet/e-business/other,news_tech/internet/e-business,news_agriculture/farming,news_tech/internet,news_agriculture,news_tech|,|冷链学术|冷链物流基础上的生鲜电商发展研究 +news_health/sexual_health/women_health/gynecology,news_health/sexual_health/women_health,news_health/sexual_health,news_health|,|关于子宫内膜异位症,你应该先看看这一篇! +news_society/news_law/other,news_politics/politics_law,news_society/news_law,government,news_society|,|临漳交警执勤捡手机 归还失主受称赞 +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_history/chinese_history,news_history|,|赵匡胤一脉能重新获得江山,靠得竟是一个女人的梦! +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_history/chinese_history,news_history|,|秦国年仅12岁的奇才,吕不韦不到的事他办到了 +news_culture/reading/poetry,news|,|网友最有情怀20首诗词:我有一壶酒,可以慰风尘 +news_home/decoration/furniture,news_home/decoration,news_home|,|装修老司机给出的实用小窍门,句句戳心!总有几点是你所忽略掉的 +news|,|浙江仙居上演荧光炫彩夜跑 +news_military/weaponry/other,news_military/weaponry,news_military/military_china,news_military|,|MP5冲锋枪如此成功,适合中国警察吗?其实有远比它更好的选择 +news_society/news_law/statute,news_society/news_law,government,news_society|,|法庭启动联动司法 诉前成功调解家庭纠纷 +funny/other,funny|,|哈哈哈哈哈!最强买家秀来袭,双十一防剁手全靠它了! +news_baby/baby_growth/baby_parenting/other,news_baby/baby_growth/baby_parenting,news_baby/baby_growth,news_baby|,|教孩子学会情绪管理!成才,情商占80%,智商仅占20% +news_edu/edu_upgrade/other,news_edu/edu_upgrade,news_edu|,|兴趣是最好的老师|小芸老师开导学生软硬兼施智慧满满 +news_health/regimen/other,news_health/cancer/other,news_health/cancer,news/pandian,news_health/regimen,news_health|,|全球十大垃圾食品,吃了只会增加负担! +news_society/other,news_society|,|加拿大3名男警察性侵“警花” 或利用“失身酒” +news_society/civil/other,news_politics/politics_law,news_society/civil,news_society|,|中秋节期间玉山县道路出行提示 +news_home/floriculture,news|,|花箱盆栽合理利用花园空间 +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_culture/other,news_history/chinese_history,news_culture|,|微观茶市丨顾绍培师生紫砂作品展东莞启幕 +digital/cellphone/other,digital/hardware,digital/cellphone,digital|,|小米5s:真正的强大,总是由内而外 +news_pet/dogs,news_pet|,|二哈陷入自恋中无法自拔,每天都照镜子欣赏自己,这幕让人笑喷 +news_home/decoration/other,news_home/decoration,news_home|,|头一次见水电安装地面这样开槽,楼下邻居知道了定会“拼命”的! +news_tech,news_edu/online_education,news|,|网易有道CEO周枫:在线教育的风暴之眼 +news_tech/internet/mobile_internet/other,news_finance/investment/stock/computer_industry,news_finance/financing/other,news_tech/internet/mobile_internet,news_finance/business_management/marketing,news_finance/financing,news_tech/internet,news_finance,news_tech|,|每日e报|京东去年为员工缴纳五险一金超27亿元 传乐视金融欲收购数码视讯支付牌照 +news_society/news_law/other,news_society/news_law,news_society|,|冒充烟草专卖局工作人员 女子专骗开便利店老人 +news_history/military_history/anti_japanese_war,emotion/affection/love,news_entertainment|,|《毕业歌》热血与激情铸就不灭青春 +news_food/tea,emotion/jitang,news_food|,|夏至夜,一锅幸福的茶鸡蛋 | 松鼠的厨房 +news|,|一根有斑点的香蕉竟然这么厉害?绝对长知识! +news_baby/pregnancy/pregnancying,news_baby/pregnancy,news_baby|,|与胎儿做游戏不是天方夜谭!踢肚游戏,可培养出健壮、灵敏的宝宝 +news_tech,news_finance/business_management/marketing,news|,|Annie岳:被忽略的分类信息网站营销 +news_fashion/fashion_man/mens_clothing,news_fashion/dressing/shoes,news_fashion/dressing,news_fashion|,|想要问问你敢不敢,像我这样穿运动服走红毯 +news_history/military_history/anti_japanese_war,news_history/other,news_history|,|他屡败日寇后日寇以厚禄诱降 被他断然拒绝 朱德赞其太行屏障 +news_culture/reading/literature/other,news_culture/art/other,news_culture/reading/literature,news_culture/reading/poetry,news_culture/art,news_culture/reading,news_culture|,|诗歌文学——凤鸣新春 +news|,|小雪丨天渐寒,雪渐盛,又是一年将尽时 +news_travel/other,news_travel|,|赏红叶—郎山红叶忆英雄 +news_health/regimen/diet_therapy,news_health/traditional_chinese_medicine/chinese_herbology,news_health/regimen/healthy_eating,news_health/traditional_chinese_medicine,news_health/regimen,news_health|,|那些药膳靠谱么? +news_history,news_culture/reading/literature/ancient_poetry,news_culture/reading/literature/other,news_culture/reading/literature,news_culture/traditional_chinese/other,news_culture/traditional_chinese,news_society/other,news_society,news_culture|,|国庆长假探亲,亲戚间到底为何攀比? +digital/cellphone/other,digital/cellphone,news_tech/internet_cellphone,digital,news_tech|,|每1.1秒卖出一台?2016年OPPO为何销量暴涨 +news_travel/other,news_travel|,|原来“世外桃源”真的存在! +news_history/military_history/anti_japanese_war,news_history/chinese_history/modern_history_china,news_history/other,news_history|,|缅怀12.9|关于12.9,请收下这些! +news_finance,news|,|新书《京东》即将上市:为什么是京东? +news_food/menu/chinese_food,news_food/catering_industry,news_food/cooking_skill,news_food|,|刘忠明收徒好威风! +news|,|全北京向上看:双彩虹配火烧云 一半彩虹一半火焰 +news_tech/other,news_travel/travel_industry,news_travel,news_tech|,|新趋势:超三成网友愿在航空公司购买出境游产品 +news_sports/other,news_sports|,|中断运动两星期会怎样? +news|,|春季患上心理感冒怎么办? +news_culture/art/handicraft,news_culture/art,news_culture|,|上周无锡灵山梵宫失火的廊厅里到底藏有哪些绝世珍宝? +news_society/civil/environmental_protection/environmental_pollution,news_society/civil/environmental_protection/air_pollution,news_society/civil/environmental_protection,news_society/civil/conserve_energy_emission,news_finance/other,news_finance|,|本想做个倒爷,不料搞出世界500强,24年不上市,他为啥牛? +news_history/other,news_history|,|他是李广的孙子,以五千步兵力战匈奴八万大军 +news_entertainment/film_tv/movie/euro_and_us_movie,news_entertainment/west_entertainment/hollywood,news_entertainment/film_tv/movie,news_entertainment/film_tv,news_entertainment|,|能看到这两位“黑帮教父”同台对垒,是多少影迷的夙愿 +news_fashion/fashion_man/mens_clothing,news_fashion/dressing/shoes,news_fashion/fashion_man,news_fashion|,|你真以为你看全了GUCCI男装秀?太天真了! +news_car/new_energy_car/electric_car,news_car/luxury_car,news_car/car_industry,news_car/german_car,news_car|,|增速要高于高端车市场 奔驰如何押宝中国? +news_edu/other,news_tech/software,news_edu|,|不会做的题,找“题谷”看老师视频讲解啊! +news_comic/blood_cartoon,news|,|母亲节,你老妈请求添加你为好友 +news_society/weather/other,news_society/weather,news_society|,|烟台今起3天都有阵雨,气温仍较高,今天最高温31℃! +news_food/menu/other,news_food/menu,news_food|,|用最简单的方法炒出最好吃的土豆丝,一上桌抢着吃 +news_home/other,news_home|,|新型环保背景墙,敬请欣赏 +news_tech,news|,|双十一战火已燃,京东拒绝刷单出奇招 +news_sports/fight/boxing,news_sports/fight/integrated_combat,news_sports/fight,news_sports|,|邹市明被高估?拳迷:年轻5岁打伦龙满地找牙 KO播求不是梦! +technique,science_all,news_world,news_travel,news_tech,news_sports,news_society,news_politics,news_pet,news_military,news_house,news_home,news_history,news_health,news_game,news_food,news_finance,news_fashion,news_entertainment,news_edu,news_culture,news_comic,news_car,news_baby,news_astrology,lottery,immigration,funny,fashion_wedding,emotion,digital,buddhism|,|花卉瓶插的理想水温是多少 +news_tech,news|,|国内资本为何蜂拥投资直播行业? +news_society/other,news_society|,|事关3700000宝鸡人的好消息,距离2018年还 +news_tech/internet/mobile_internet/other,news_finance/entrepreneurship/other,news_tech/internet/mobile_internet,news_finance/entrepreneurship,news_finance|,|推出伙伴创业计划,滴滴到底想要干什么? +news_tech/internet/mobile_internet/other,news_tech/internet/mobile_internet,news_tech/other,news_media,news_tech|,|活在朋友圈的自拍一代,把整个世界都当成了背景! +news_photography|,|藤椅上小憩的长腿美少女,笑容清纯甜美,身姿窈窕,极品女神! +news_fashion/cosmetology/make_up/other,news_fashion/cosmetology/beautify/skin_care,news_fashion/cosmetology/beautify,news_fashion/cosmetology/make_up,news_fashion/other,news_fashion|,|冒着被海关叔叔扣下也要带回的药妆在这里(附香邂格蕾获奖名单) +news_food/snack,news_food|,|今日立冬,北京的吃货们该进补了 +news_photography|,|美翻了泉州夜景,你见过了吗? +news_culture/other,emotion/jitang,news_culture|,|国馆丨总要争输赢的人,没有未来 +emotion/jitang,news_psychology|,|测一下你的社交性格? +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_history/chinese_history,news_history|,|康熙皇帝为什么是中国历史上最了解西方科学技术的帝王 +news_finance/investment/other,news_society/civil/social_insurance,news_finance/financing/insurance,news_finance/investment,news_society/civil,news_finance|,|医保到底有没有用?有什么用? +news/pandian,news_comic/japan_cartoon,news_comic/manhua,news_comic|,|打破主角光环!解析路飞四档 +news_society/other,news_society|,|抢订退伍兵在行动,你被抢了吗? +news_sports/basketball/NBA,news_sports/basketball,news_sports|,|由飞踹下体引发的溃败,更加证明格林才是勇士胜负的关键手! +news_politics/politics_law,news_society/other,news_society|,|临漳交警春节前夕开展夜查统一行动 +digital,news|,|让家里wifi信号增强10倍,只要一个它! +news_home/other,news_agriculture/animal_husbandry,news_agriculture,news_home|,|原本让人头疼的水葫芦,你却在家养的让人羡慕! +news_food/seafood,news|,|据说这款被称为史上最强的饵料能把红虫都打闭口? +news_finance/investment/stock/usa_stock,news_finance/financing/other,news_finance/macro_economic/macro_economic_china,news_finance/financing,news_finance/macro_economic,news_finance|,|国际评定机构下调中国信用主权评级?不必太在意 +news_food/menu,news_food|,|日本豆腐的9种不同做法合集 +news_comic/manhua/other,news/pandian,news_comic/manhua,news_comic|,|细思密恐|漫画|细数人类创造的那些恐怖发明 +news_politics,news_house,news_finance|,|两会初探:代表委员们如何给房地产建言献策? +news_food/other,news_food|,|栗蘑美食系列之三“栗蘑炖鸡” +news_tech/internet/e-business/other,news_finance/business_management/marketing,news_tech/internet/e-business,news_tech/other,news/singles_day,news_edu/philosophy,news_tech|,|好物哲学+内容营销 京东超市赢在双11 +news_astrology/other,news_astrology|,|星座女神:本周运势(0704—0710) +digital/coolplay/wearable_devices,digital/coolplay,digital|,|Basslet手环让乐迷在外也能感受低音炮的震撼 +news_sports/football/football_china/china_league,news_sports/football/football_china,news_sports/other,news_sports|,|卡纳瓦罗有能力带领天津权健冲超?我不太相信 +news_finance/investment/foreign_exchange,news_finance/other,news_finance|,|人民币中间价大跌超400点,海外资产配置势在必行 +news_entertainment/gossip,news_entertainment|,|出轨、把妹、各取所需?这些明星的人设为何说崩就崩 +news_house/other,news/pandian,news_house|,|翻开2016 : 看看升龙集团成功的八大关键词 +news_entertainment/film_tv/movie/other,news_entertainment/film_tv/movie,news_entertainment/film_tv,news_pet/cats,news_entertainment|,|《九条命》,毛裤先生笑傲三甲 +news_baby/baby_growth/other,news_baby/baby_nurturing/baby_nursing,news_baby/baby_nurturing,news_baby/baby_growth,news_baby|,|宝宝说话晚是“贵人语迟”?说话晚的孩子智商高吗? +news_sports/basketball/NBA,news_sports/other,news_sports|,|连续两个赛季18次三双威少再成历史第一,胡子兄弟爆双熊成砥柱 +news_entertainment/film_tv/movie/horror_movie,news_entertainment/film_tv/movie,news_entertainment/film_tv,news_entertainment|,|不超过1000个人看过这部有点像《前目的地》的悬疑恐怖片 diff --git a/keras_textclassification/data/byte_multi_news/valid.csv b/keras_textclassification/data/byte_multi_news/valid.csv new file mode 100644 index 0000000..e7552c5 --- /dev/null +++ b/keras_textclassification/data/byte_multi_news/valid.csv @@ -0,0 +1,98 @@ +label|,|ques +news_tech/internet/internet_finance/p2p_finance,news_finance/investment/finance_management,news_finance/other,news_finance|,|行业整顿后的P2P理财会变成什么样?|愉见财经 +emotion/jitang,news|,|老婆:今天小年,你好吗 +news_astrology/other,news_astrology|,|测试 | 流言蜚语能伤你多少 +news_finance/investment/stock/usa_stock,news|,|"朵女郎"登陆纽约时代广场纳斯达克大屏为中国健儿加油 +news_entertainment/other,news_entertainment|,|贝克汉姆虐布鲁克林千百回,却始终待小七如初恋 +news_history/chinese_history/ancient_chinese_history/other,news_history/chinese_history/ancient_chinese_history,news_history/chinese_history,news_history|,|明朝的庚戌之变究竟是谁之过? +news_health/other,news_health|,|手术中医生想要“套套”,护士却不解其意 +news_essay|,|无忧之人老练,竟然也有烦心之事 +digital/cellphone/other,digital/cellphone,digital|,|小米有史以来下架最快的手机,你有买过么? +news_world/other,news_world|,|美国总统不是特朗普,而是班农 +news_edu/edu_upgrade/college_entrance_examination,news_edu/edu_upgrade,news_edu|,|沈阳一考点挂霸气条幅 刷爆朋友圈 +news_local,emotion/jitang,news|,|几年的放纵,换来的是一生卑微,致济南所有青春里的孩子! +news_comic/manhua/other,news_comic/manhua,news_society/other,news_society|,|怎样成为一个人见人爱的奇男子?「颜值篇」 +news_game/mobile_game/other,emotion/affection/other,emotion/affection,emotion/jitang,news_game/mobile_game,emotion,news_game|,|感情不是游戏,谁也伤不起 +news_society/civil/environmental_protection/air_pollution,news_society/civil/environmental_protection/environmental_pollution,news_society/civil/environmental_protection,news_society/other,news_society|,|全球污染城市排名出炉,面对严重雾霾三岁以下儿童家长怎么应对! +news_baby/pregnancy/other,news_baby/pregnancy,news_baby|,|我的暖男天使 +emotion,news|,|吃雪饼没变旺被举报:生活里的奇葩为什么多了起来? +news_society/other,news_society|,|如果你不了解这个词经常出入会有危险! +emotion/marriage/other,emotion/marriage,news_essay|,|休闲空间你以为你是谁? +news_tech,digital|,|出门访友 异地也能轻松下载 +news_baby/baby_growth/baby_parenting/pre_education,news_tech/other,technique/cloud_computing,news_edu/family_education,news_tech|,|探访成都明星创业公司之铁皮人:智能玩具不能取代亲子教育 +news_tech/internet/other,news_tech/internet,news_tech|,|巨人雅虎没有失败!英雄迟暮的故事还有很多,因为大家喜欢听 +news_edu/edu_upgrade/chinese_language,news_culture/reading/poetry,news|,|高中语文诗词鉴赏四步教学法 +news_culture/art/traditional_opera,news/pandian,news_entertainment/drama,news_culture/art,news_culture|,|盘点2016年蒲剧大事记 +news_society/other,news_society|,|惊艳!武汉首辆有轨电车试跑,真的快通车了 +funny/other,funny|,|妹妹对姐姐说:姐今天蚊子好多哦~ +news_society,news|,|黄梅|龙感湖飞鸟翔集 +news|,|“最初一公里”是国产水果提高市场竞争力的关键 +news_fashion/dressing/other,news_home/decoration/home_design,news_home/other,news_design/other,news_fashion/dressing,news_design,news_home,news_fashion|,|2016年度流行色搭配,9款客厅设计势不可挡! +news_politics/politics_law,news_society/other,news_society|,|学习消防安全常识二十条 +news_health/body_building/other,news_health/body_building,news_health/slimming,news_health|,|健身千万别忽略小腿的锻炼6种方式增强小腿力量减缓腿部衰老 +news_sports/basketball/other,news/pandian,news_sports/basketball,news_sports|,|第一大前走好,盘点邓肯和其他顶级大前的对位。 +news_society/other,news_society|,|寻找“我身边的工匠”手机诵读大赛 3000元大奖邀您参加 +news_car/car_usage/other,news_car/huohua_car_usage,news_car/car_usage,news_car|,|汽车天窗7个使用小技巧 +news,news_home/other,news_home|,|火龙果小森林制作日记及教程 +news_edu/edu_upgrade/adult_education,news_politics/other,news_edu/other,news_edu,news_politics|,|马克思主义学院在职研修班10月23日截止报名! +news_history/world_history/tomb,news_history/archaeology,news_culture/historical_relic,news_history|,|一个灵异的陵墓,墓道刻有一句诅咒,千百年来,盗墓贼均无一生还 +news_history,news|,|狠太子弑父皇,在宫中大开杀戒,下令剖开父皇爱妃的胸膛 +news_health/cancer/breast_cancer,news_health/cancer,news_health/health_soft,news_health|,|女人刮腋毛会不会惹乳腺癌? +news_military/navy/warship,news_military/weaponry/aircraft_carrier,news_entertainment/other,news_military/navy,news_military/weaponry,news_entertainment|,|感动!一个海军航空兵的士兵突击 +digital/coolplay/smart_home/other,digital/coolplay/smart_home,digital/coolplay,digital|,|一款可以远程遥控的开关----COOLTOUCH无线智能开关 +news,video_funny|,|央视财经频道就静宁县苹果产业进行专访 +news_tech/other,digital/hardware,news_tech|,|跨界化反:乐视“413生态共享之夜”是一道怎样的化学方程式 +emotion/marriage/other,news_society/other,news_baby/other,emotion/marriage,news_baby,news_society|,|连续三胎生下脑瘫孩子,心力交瘁的夫妻俩得知原因后陷入绝望 +emotion/jitang,news_essay|,|我们都坐在没有月光的屋子里 +news_tech/internet/mobile_internet/other,news_tech/internet/mobile_internet,news_tech/other,news_tech|,|专车新政落地,神州、优步、滴滴谁最受益? +news_travel/other,news_travel|,|天下第一油菜花 +news_food/menu/other,news_food/seafood,news_food/menu,news_food|,|男人最爱的下酒菜!这么烧,鲜香味美,一盘铁定不够吃 +news_fashion/cosmetology/body_care/other,news_health/body_building/other,news_fashion/cosmetology/body_care,news_health/body_building,news_health/slimming,news_health|,|牛掰的是:已经有肌肉的人仍在塑形脱脂变型男,而你仍是个喷子. +funny,news|,|62岁老人曾4小时玩108次跳楼机 +emotion/marriage/other,emotion/affection/love,emotion/affection,emotion/marriage,emotion/jitang,emotion|,|“如果不是对的人,无论追多久也不会换来爱情的” +digital/operating_system/Android,digital/pad,digital/hardware,news_tech/software,digital|,|联想:安卓平板还是有市场的 +news_food/menu/other,news_food/menu,news_food|,|告诉你,我家的肉饼都是这样煎出来的 +news_food/other,news_food|,|味道 | 大文豪元好问曾用它祭雁,这到底是什么美味? +news_food/other,news_food|,|我想和你做件温暖又健康的事!吃小火锅,你请! +news_pet/other,news_pet|,|故事:为了泡妞打赌,傻缺穿着红裤衩去河边撵狗,中邪差点跳了河 +news_food/wine_drinking/grape_wine,news_food/wine_drinking,news|,|怎样做葡萄酒,来自酒厂的秘方教你自制红酒 +news_food,news|,|我们驱车20多公里,就为了这一顿魂牵梦萦的牛肉! +news_politics/politics_law,news_society/other,government,news_society|,|校车安全检查 +news_fashion/cosmetology/beautify/skin_care,news_health/regimen/other,news_health/daily_health,news_health/regimen,news_health|,|嫩手炸弹!洗碗布上的细菌竟是带有40亿个活细菌!伤手又伤身! +emotion/affection/other,emotion/affection,emotion/low_grage_emotion,emotion/jitang,emotion|,|男人的爱,女人永远觉得不够 +news_baby,news|,|6月8日海洋日阅读推荐:这是我们的海洋,我们的责任 +news_society/news_law/statute,news_politics/other,news_society/news_law,news_society,news_politics|,|夜深了,法院会议室的灯还亮着······ +news_entertainment/film_tv/movie/film_festival,news_entertainment/other,news_entertainment|,|第三届丝路电影节形象大使成龙获奥斯卡终身成就奖 +news_society/other,news_society|,|肯陪伴、悦成长,从“心”关爱留守儿童 +digital/coolplay/virtual_reality,news_tech/other,digital/coolplay,news_tech,digital|,|排名第一遥遥领先?三星Gear VR占到七成以上 +news_society/civil/philanthropy,news_edu/edu_upgrade/college,news_edu/other,news_edu|,|陈天桥捐1.15亿美元背后:为啥中国富豪喜欢给美国大学捐款? +digital/coolplay/other,digital/headset,digital/hardware,digital/coolplay,digital|,|iPhone7音质救星——3款千元级便携耳放推荐 +news_pet/cats,news_pet|,|猫咪躲水沟盖下,七十多岁老奶奶怕它饿坏,趴地上塞饲料,太感人 +news_history/other,news/pandian,news_history|,|揭秘朱元璋杀人:众大臣装疯卖傻才能免于一死 +news_tech/internet/other,news_tech/internet,news_tech|,|一天纳税1亿不过是小目标,马云的大目标是要做“英雄联盟” +news_world/diplomacy/middle_east,news_military/military_world,news_world/diplomacy,news_world,news_military|,|俄罗斯不惧拼刺刀 贴近北约部署核导弹 美国使“阴招”应对 +news_fashion/dressing,news_fashion|,|穿好boyfriend jeans,时尚变身没商量 +news_culture/traditional_chinese/other,news_politics/other,news_culture/traditional_chinese,news_agriculture/countryside,news_culture,news_politics|,|每年3000万扶持文化旅游发展,还首设孟子文化奖……邹城这是给全国人民送大礼 +news_baby/baby_growth,news_baby|,|看哭了无数妈妈的断奶故事 +news_society/other,news_politics/other,news_society,news_politics|,|海西消防力保元宵节期间全州消防安全形势稳定 +news_entertainment|,|蒋勤勤18岁出道青涩照美貌赛天仙(图) +emotion/affection/other,emotion/affection,emotion/low_grage_emotion,emotion|,|老公让她人流七次,爱爱时避孕到底是谁的任务? +news_comic/japan_cartoon/other,news_comic/japan_cartoon,news_comic/blood_cartoon,news_comic|,|没有这个齐神看真的是要死了,幸好续作制作决定来得像龙卷风 +news_astrology/other,news_astrology|,|星座女神:本周运势(10.17—10.23) +news_military/other,news_military|,|盘点各种“异型”枪,你所不知道的伪装枪 +news_car/car_exhibition,news_car/south_korean_car,news_car/car_new_arrival,news_car|,|或成起亚最快量产车?Stinger亮相北美车展! +news_health/other,news_health|,|8个动作帮你 瘦腿!瘦腰!瘦手臂,一个月大变身! +news_game/mobile_game/other,news_fashion/fashion_man/other,news_fashion/fashion_man,news_game/mobile_game,news_society/positive,news_fashion,news_game|,|吉田羊 × Acne Studios +news_tech/internet/mobile_internet/other,news_tech/internet/e-business/O2O,news_tech/internet/mobile_internet,news_tech/internet,news_tech|,|美团与点评合并,这是一个时代的开始?也许是一个时代的结束 +science_all/other,science_all|,|男子翻修房子时,发现神秘地下室,真是稀奇! +news_tech/software,digital/cellphone,digital,news_tech|,|iPhone花样式刷机教程 +news_sports/swimming,news_sports|,|中国赚钱最多的运动员,第一原来是她 +digital/appliances/small_home_appliance,news_health/other,news_health|,|提醒!这种杯子不能用来喝水,赶紧换掉,修武、武陟、焦作人家里都有! +news_entertainment/film_tv/movie/other,news_entertainment/film_tv/movie,news_entertainment/film_tv,news_entertainment|,|实力派黄渤将加盟《西游记:女儿国》 +news_sports/olympic_games/other,news_entertainment/other,news_sports/ping_pong,news_sports/olympic_games,news_sports,news_entertainment|,|张继科成大忙人 元宵晚会他将演唱这首老歌 你还记得当初的她吗 +news_society/news_law/statute,news_society/news_law,news_society|,|阆中开展夏季治安清查 一批涉案人员被拘留 +digital/other,digital|,|小蚁智能行车记录仪实测画面曝光,值得期待! +news_culture/reading/poetry,news|,|诗词闲读2天净沙·秋 +news|,|慧保周报(2016年第22周) +digital/cellphone/other,digital/hardware,digital/cellphone,news_tech/internet_cellphone,digital,news_tech|,|MTK的芯片能不能配得上高端手机 +news_entertainment/film_tv/movie/chinese_movie,news_entertainment/film_tv/movie/horror_movie,news_entertainment/film_tv/movie,news_entertainment/film_tv,news_entertainment|,|她做了半个世纪的配角演员,成香港电影传奇,却一生单身无人伴 diff --git a/keras_textclassification/data_preprocess/text_preprocess.py b/keras_textclassification/data_preprocess/text_preprocess.py index 6c4d470..f4a36cb 100644 --- a/keras_textclassification/data_preprocess/text_preprocess.py +++ b/keras_textclassification/data_preprocess/text_preprocess.py @@ -18,6 +18,48 @@ import re import os + +def txt_read(file_path, encode_type='utf-8'): + """ + 读取txt文件,默认utf8格式 + :param file_path: str, 文件路径 + :param encode_type: str, 编码格式 + :return: list + """ + list_line = [] + try: + file = open(file_path, 'r', encoding=encode_type) + while True: + line = file.readline() + line = line.strip() + if not line: + break + list_line.append(line) + file.close() + except Exception as e: + print(str(e)) + finally: + return list_line + + +def txt_write(list_line, file_path, type='w', encode_type='utf-8'): + """ + txt写入list文件 + :param listLine:list, list文件,写入要带"\n" + :param filePath:str, 写入文件的路径 + :param type: str, 写入类型, w, a等 + :param encode_type: + :return: + """ + try: + file = open(file_path, type, encoding=encode_type) + file.writelines(list_line) + file.close() + + except Exception as e: + print(str(e)) + + def extract_chinese(text): """ 只提取出中文、字母和数字 @@ -178,3 +220,141 @@ class PreprocessText: random.shuffle(indexs) x_, y_ = x_[indexs], y_[indexs] return x_, y_ + + + + +def transform_multilabel_to_multihot(sample, label=1070): + """ + + :param sample: [1, 2, 3, 4] + :param label: 1022 + :return: [1, 0, 1, 1, ......] + """ + result = np.zeros(label) + result[sample] = 1 + res = result.tolist() + # res = ''.join([str(r) for r in res]) + return res + + +class PreprocessTextMulti: + """ + 数据预处理, 输入为csv格式, [label,ques] + """ + def __init__(self): + self.l2i_i2l = None + if os.path.exists(path_fast_text_model_l2i_i2l): + self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l) + + def prereocess_idx(self, pred): + if os.path.exists(path_fast_text_model_l2i_i2l): + pred_i2l = {} + i2l = self.l2i_i2l['i2l'] + for i in range(len(pred)): + pred_i2l[i2l[str(i)]] = pred[i] + pred_i2l_rank = [sorted(pred_i2l.items(), key=lambda k: k[1], reverse=True)] + return pred_i2l_rank + else: + raise RuntimeError("path_fast_text_model_label2index is None") + + def prereocess_pred_xid(self, pred): + if os.path.exists(path_fast_text_model_l2i_i2l): + pred_l2i = {} + l2i = self.l2i_i2l['l2i'] + for i in range(len(pred)): + pred_l2i[pred[i]] = l2i[pred[i]] + pred_l2i_rank = [sorted(pred_l2i.items(), key=lambda k: k[1], reverse=True)] + return pred_l2i_rank + else: + raise RuntimeError("path_fast_text_model_label2index is None") + + def preprocess_label_ques_to_idx(self, embedding_type, path, embed, rate=1, shuffle=True): + if type(path) == str: + label_ques = txt_read(path) + ques = list() + label = list() + for lq in label_ques[1:]: + lqs = lq.split('|,|') + ques.append(lqs[1]) + label.append(lqs[0]) + elif type(path) == list and ',' in path[0]: + label = [label_ques.split(',')[0] for label_ques in path] + ques = [label_ques.split(',')[1] for label_ques in path] + else: + raise RuntimeError('type of path is not true!') + + len_ql = int(rate * len(ques)) + if len_ql <= 50: # 数量较少时候全取, 不管rate + len_ql = len(ques) + ques = ques[: len_ql] + label = label[: len_ql] + print('rate ok!') + + ques = [str(q).strip().upper() for q in ques] + + # label = [[l_ for l_ in str(l).upper().split(',')] for l in label] + # 获取单个标签, 如 ['news,tips', 'news,news_tech']转化为['news','tips','news_tech'] + # label_list = [] + # for l in label: + # label_single = str(l).strip().upper().split(',') + # label_list = label_list + label_single + # label_set = set(label_list) + # len_label_set = len(label_set) + # print(len_label_set) + + from keras_textclassification.conf.path_config import path_byte_multi_news_label + byte_multi_news_label = txt_read(path_byte_multi_news_label) + byte_multi_news_label = [i.strip().upper() for i in byte_multi_news_label] + label_set = set(byte_multi_news_label) + len_label_set = len(label_set) + # 保存标签类标,数字文本转化 + count = 0 + label2index = {} + index2label = {} + for label_one in label_set: + label2index[label_one] = count + index2label[count] = label_one + count = count + 1 + l2i_i2l = {} + l2i_i2l['l2i'] = label2index + l2i_i2l['i2l'] = index2label + save_json(l2i_i2l, path_fast_text_model_l2i_i2l) + + + x = [] + for que in ques: + que_embed = embed.sentence2idx(que) + x.append(que_embed) # [[], ] + + print('que_embed ok!') + + # 转化为多标签类标 + label_multi_list = [] + count = 0 + for l in label: + count += 1 + label_single = str(l).strip().upper().split(',') + label_single_index = [l2i_i2l['l2i'][ls] for ls in label_single] + label_multi = transform_multilabel_to_multihot(label_single_index, label=len_label_set) + label_multi_list.append(label_multi) + + print('label_multi_list ok!') + if embedding_type == 'bert': + x_, y_ = np.array(x), np.array(label_multi_list) + if shuffle: + indexs = [ids for ids in range(len(y_))] + random.shuffle(indexs) + x_, y_ = x_[indexs], y_[indexs] + x_1 = np.array([x[0] for x in x_]) + x_2 = np.array([x[1] for x in x_]) + x_all = [x_1, x_2] + return x_all, y_ + else: + x_, y_ = np.array(x), np.array(label_multi_list) + if shuffle: + indexs = [ids for ids in range(len(y_))] + random.shuffle(indexs) + x_, y_ = x_[indexs], y_[indexs] + return x_, y_ + diff --git a/keras_textclassification/m03_CharCNN/graph_zhang.py b/keras_textclassification/m03_CharCNN/graph_zhang.py index 55560dd..21b802b 100644 --- a/keras_textclassification/m03_CharCNN/graph_zhang.py +++ b/keras_textclassification/m03_CharCNN/graph_zhang.py @@ -2,4 +2,55 @@ # !/usr/bin/python # @time :2019/6/12 14:32 # @author :Mo -# @function : +# @function :graph of charCNN-zhang +# @paper : Character-level Convolutional Networks for Text Classification(https://arxiv.org/pdf/1509.01626.pdf) + + +from __future__ import print_function, division + +# char cnn +from keras.layers import Convolution1D, MaxPooling1D, ThresholdedReLU +from keras.layers import Dense, Dropout, Flatten +from keras.models import Model + +from keras_textclassification.base.graph import graph + + +class CharCNNGraph(graph): + def __init__(self, hyper_parameters): + """ + 初始化 + :param hyper_parameters: json,超参 + """ + self.char_cnn_layers = hyper_parameters['model'].get('char_cnn_layers', + [[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]],) + self.full_connect_layers = hyper_parameters['model'].get('full_connect_layers', [1024, 1024],) + self.threshold = hyper_parameters['model'].get('threshold', 1e-6) + super().__init__(hyper_parameters) + + def create_model(self, hyper_parameters): + """ + 构建神经网络 + :param hyper_parameters:json, hyper parameters of network + :return: tensor, moedl + """ + super().create_model(hyper_parameters) + x = self.word_embedding.output + # x = Reshape((self.len_max, self.embed_size, 1))(embedding_output) # (None, 50, 30, 1) + # cnn + pool + for char_cnn_size in self.char_cnn_layers: + x = Convolution1D(filters = char_cnn_size[0], + kernel_size = char_cnn_size[1],)(x) + x = ThresholdedReLU(self.threshold)(x) + if char_cnn_size[2] != -1: + x = MaxPooling1D(pool_size = char_cnn_size[2], + strides = 1)(x) + x = Flatten()(x) + # full-connect + for full in self.full_connect_layers: + x = Dense(units=full,)(x) + x = ThresholdedReLU(self.threshold)(x) + x = Dropout(self.dropout)(x) + output = Dense(units=self.label, activation=self.activate_classify)(x) + self.model = Model(inputs=self.word_embedding.input, outputs=output) + self.model.summary(120) \ No newline at end of file diff --git a/keras_textclassification/m03_CharCNN/train_zhang.py b/keras_textclassification/m03_CharCNN/train_zhang.py new file mode 100644 index 0000000..cfaf503 --- /dev/null +++ b/keras_textclassification/m03_CharCNN/train_zhang.py @@ -0,0 +1,100 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/8/14 11:08 +# @author :Mo +# @function :train of CharCNNGraph_kim with baidu-qa-2019 in question title + + +# 适配linux +import pathlib +import sys +import os +project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent) +sys.path.append(project_path) +# 地址 +from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters +# 训练验证数据地址 +from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid +# 数据预处理, 删除文件目录下文件 +from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file +# 模型图 +from keras_textclassification.m03_CharCNN.graph_zhang import CharCNNGraph as Graph +# 计算时间 +import time + + +def train(hyper_parameters=None, rate=1.0): + """ + 训练函数 + :param hyper_parameters: json, 超参数 + :param rate: 比率, 抽出rate比率语料取训练 + :return: None + """ + if not hyper_parameters: + hyper_parameters = { + 'len_max': 50, # 句子最大长度, 固定 推荐20-50 + 'embed_size': 300, # 字/词向量维度 + 'vocab_size': 20000, # 这里随便填的,会根据代码里修改 + 'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调 + 'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word' + 'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'random'、 'bert' or 'word2vec" + 'gpu_memory_fraction': 0.66, #gpu使用率 + 'model': {'label': 17, # 类别数 + 'batch_size': 32, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大 + 'filters': [2, 3, 4, 5], # 卷积核尺寸 + 'filters_num': 300, # 卷积个数 text-cnn:300-600 + 'channel_size': 1, # CNN通道数 + 'dropout': 0.5, # 随机失活, 概率 + 'decay_step': 100, # 学习率衰减step, 每N个step衰减一次 + 'decay_rate': 0.9, # 学习率衰减系数, 乘法 + 'epochs': 20, # 训练最大轮次 + 'patience': 10, # 早停,2-3就好 + 'lr': 1e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数 + 'l2': 1e-9, # l2正则化 + 'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数 + 'loss': 'categorical_crossentropy', # 损失函数 + 'metrics': 'accuracy', # 保存更好模型的评价标准 + 'is_training': True, # 训练后者是测试模型 + 'model_path': path_model, + # 模型地址, loss降低则保存的依据, save_best_only=True, save_weights_only=True + 'path_hyper_parameters': path_hyper_parameters, # 模型(包括embedding),超参数地址, + 'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等 + # only charCNN_zhang + 'threshold': 1e-6, + 'char_cnn_layers': [[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]], # small + # [[1024, 7, 3], [1024, 7, 3], [1024, 3, -1], [1024, 3, -1], [1024, 3, -1], [1024, 3, 3]], # large + 'full_connect_layers': [1024, 1024], # [2048, 2048], large + + }, + 'embedding': {'layer_indexes': [12], # bert取的层数, + # 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址 + }, + 'data':{'train_data': path_baidu_qa_2019_train, # 训练数据 + 'val_data': path_baidu_qa_2019_valid # 验证数据 + }, + } + + # 删除先前存在的模型\embedding微调模型等 + delete_file(path_model_dir) + time_start = time.time() + # graph初始化 + graph = Graph(hyper_parameters) + print("graph init ok!") + ra_ed = graph.word_embedding + # 数据预处理 + pt = PreprocessText() + x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'], + hyper_parameters['data']['train_data'], + ra_ed, rate=rate, shuffle=True) + x_val, y_val = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'], + hyper_parameters['data']['val_data'], + ra_ed, rate=rate, shuffle=True) + print("data propress ok!") + print(len(y_train)) + # 训练 + graph.fit(x_train, y_train, x_val, y_val) + print("耗时:" + str(time.time()-time_start)) + + +if __name__=="__main__": + train(rate=0.01) # sample条件下设为1,否则训练语料可能会很少 diff --git a/keras_textclassification/data/model/fast_text/__init__.py b/test/multi_class/__init__.py similarity index 71% rename from keras_textclassification/data/model/fast_text/__init__.py rename to test/multi_class/__init__.py index a1f6a11..4d53acf 100644 --- a/keras_textclassification/data/model/fast_text/__init__.py +++ b/test/multi_class/__init__.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- # !/usr/bin/python -# @time :2019/6/3 10:50 +# @time :2019/8/14 21:23 # @author :Mo # @function : \ No newline at end of file diff --git a/test/multi_class/predict_multi.py b/test/multi_class/predict_multi.py new file mode 100644 index 0000000..6dd9f6d --- /dev/null +++ b/test/multi_class/predict_multi.py @@ -0,0 +1,84 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/8/14 17:40 +# @author :Mo +# @function : + + +# 适配linux +import pathlib +import sys +import os +project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent) +sys.path.append(project_path) +# 地址 +from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters +# 训练验证数据地址 +from keras_textclassification.conf.path_config import path_byte_multi_news_valid, path_byte_multi_news_train +# 数据预处理, 删除文件目录下文件 +from keras_textclassification.data_preprocess.text_preprocess import PreprocessTextMulti, read_and_process, load_json +# 模型图 +from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph +# 模型评估 +from sklearn.metrics import classification_report +# 计算时间 +import time + +import numpy as np + + +def pred_input(path_hyper_parameter=path_hyper_parameters): + # 输入预测 + # 加载超参数 + hyper_parameters = load_json(path_hyper_parameter) + pt = PreprocessTextMulti() + # 模式初始化和加载 + graph = Graph(hyper_parameters) + graph.load_model() + ra_ed = graph.word_embedding + ques = '我要打王者荣耀' + # str to token + ques_embed = ra_ed.sentence2idx(ques) + if hyper_parameters['embedding_type'] == 'bert': + x_val_1 = np.array([ques_embed[0]]) + x_val_2 = np.array([ques_embed[1]]) + x_val = [x_val_1, x_val_2] + else: + x_val = ques_embed + # 预测 + pred = graph.predict(x_val) + print(pred) + # 取id to label and pred + pre = pt.prereocess_idx(pred[0]) + ls_nulti = [] + for ls in pre[0]: + if ls[1] >= 0.1: + ls_nulti.append(ls) + print(pre[0]) + print(ls_nulti) + while True: + print("请输入: ") + ques = input() + ques_embed = ra_ed.sentence2idx(ques) + print(ques_embed) + if hyper_parameters['embedding_type'] == 'bert': + x_val_1 = np.array([ques_embed[0]]) + x_val_2 = np.array([ques_embed[1]]) + x_val = [x_val_1, x_val_2] + else: + x_val = ques_embed + pred = graph.predict(x_val) + pre = pt.prereocess_idx(pred[0]) + ls_nulti = [] + for ls in pre[0]: + if ls[1] >= 0.1: + ls_nulti.append(ls) + print(pre[0]) + print(ls_nulti) + +if __name__=="__main__": + # 测试集预测 + # pred_tet(path_test=path_byte_multi_news_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少 + + # 可输入 input 预测 + pred_input() diff --git a/test/multi_class/train_multi.py b/test/multi_class/train_multi.py new file mode 100644 index 0000000..0978aee --- /dev/null +++ b/test/multi_class/train_multi.py @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/8/14 16:14 +# @author :Mo +# @function : + + +# 适配linux +import pathlib +import sys +import os +project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent) +sys.path.append(project_path) +# 地址 +from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters +# 训练验证数据地址 +from keras_textclassification.conf.path_config import path_byte_multi_news_train, path_byte_multi_news_valid +# 数据预处理, 删除文件目录下文件 +from keras_textclassification.data_preprocess.text_preprocess import PreprocessTextMulti, delete_file +# 模型图 +from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph +# 计算时间 +import time + + +def train(hyper_parameters=None, rate=1.0): + if not hyper_parameters: + hyper_parameters = { + 'len_max': 50, # 句子最大长度, 固定推荐20-50, bert越长会越慢, 占用空间也会变大, 本地win10-4G设为20就好, 过大小心OOM + 'embed_size': 300, # 字/词向量维度, bert取768, word取300, char可以更小些 + 'vocab_size': 20000, # 这里随便填的,会根据代码里修改 + 'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调 + 'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word', 注意:word2vec模式下训练语料要首先切好 + 'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'random'、 'bert' or 'word2vec" + 'gpu_memory_fraction': 0.66, #gpu使用率 + 'model': {'label': 1070, # 类别数 + 'batch_size': 32, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大 + 'dropout': 0.5, # 随机失活, 概率 + 'decay_step': 100, # 学习率衰减step, 每N个step衰减一次 + 'decay_rate': 0.9, # 学习率衰减系数, 乘法 + 'epochs': 20, # 训练最大轮次 + 'patience': 3, # 早停,2-3就好 + 'lr': 1e-3, # 学习率, bert取5e-5, 其他取1e-3, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数 + 'l2': 1e-9, # l2正则化 + 'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数 + 'loss': 'categorical_crossentropy', # 损失函数, 可能有问题, 可以自己定义 + 'metrics': 'top_k_categorical_accuracy', # 1070个类, 太多了先用topk, 这里数据k设置为最大:33 + # 'metrics': 'categorical_accuracy', # 保存更好模型的评价标准 + 'is_training': True, # 训练后者是测试模型 + 'model_path': path_model, + # 模型地址, loss降低则保存的依据, save_best_only=True, save_weights_only=True + 'path_hyper_parameters': path_hyper_parameters, # 模型(包括embedding),超参数地址, + 'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等 + }, + 'embedding': {'layer_indexes': [13], # bert取的层数 + # 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址 + }, + 'data':{'train_data': path_byte_multi_news_train, # 训练数据 + 'val_data': path_byte_multi_news_valid, # 验证数据 + }, + } + + # 删除先前存在的模型和embedding微调模型等 + delete_file(path_model_dir) + time_start = time.time() + # graph初始化 + graph = Graph(hyper_parameters) + print("graph init ok!") + ra_ed = graph.word_embedding + # 数据预处理 + pt = PreprocessTextMulti() + x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'], + hyper_parameters['data']['train_data'], + ra_ed, rate=rate, shuffle=True) + print('train data propress ok!') + x_val, y_val = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'], + hyper_parameters['data']['val_data'], + ra_ed, rate=rate, shuffle=True) + print("data propress ok!") + print(len(y_train)) + # 训练 + graph.fit(x_train, y_train, x_val, y_val) + print("耗时:" + str(time.time()-time_start)) + + +if __name__=="__main__": + train(rate=0.01) + # 注意: 4G的080Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。 + # 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的 + # win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory