Add corpus construction project

This commit is contained in:
lhy_in_blcu@126.com 2018-11-28 17:23:37 +08:00
parent 119094761b
commit 06a34764d2
39 changed files with 73306 additions and 0 deletions

12
EventMonitor/.idea/EventMonitor-master.iml Normal file
View File

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>

4
EventMonitor/.idea/misc.xml Normal file
View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>

8
EventMonitor/.idea/modules.xml Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/EventMonitor-master.iml" filepath="$PROJECT_DIR$/.idea/EventMonitor-master.iml" />
</modules>
</component>
</project>

340
EventMonitor/.idea/workspace.xml Normal file
View File

@@ -0,0 +1,340 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf>
<file leaf-file-name="process_redis.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/process_redis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="684">
<caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
<folding>
<element signature="e#14#26#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="news_spider.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="288">
<caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
<folding>
<element signature="e#142#155#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="items.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="90">
<caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
<folding />
</state>
</provider>
</entry>
</file>
<file leaf-file-name="pipelines.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
<folding>
<element signature="e#194#203#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>prin</find>
</findStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/EventMonitor/settings.py" />
<option value="$PROJECT_DIR$/EventMonitor/pipelines.py" />
<option value="$PROJECT_DIR$/EventMonitor/spiders/news_spider.py" />
<option value="$PROJECT_DIR$/process_redis.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="7">
<option name="y" value="-7" />
<option name="width" value="1366" />
<option name="height" value="732" />
</component>
<component name="ProjectView">
<navigator currentView="ProjectPane" proportions="" version="1">
<flattenPackages />
<showMembers />
<showModules />
<showLibraryContents />
<hideEmptyPackages />
<abbreviatePackageNames />
<autoscrollToSource />
<autoscrollFromSource />
<sortByType />
<manualOrder />
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
<item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
<item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
<item name="EventMonitor" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/EventMonitor/spiders" />
<recent name="$PROJECT_DIR$/EventMonitor" />
</key>
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$" />
<recent name="$PROJECT_DIR$/EventMonitor" />
<recent name="$PROJECT_DIR$/EventMonitor/spiders" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.process_redis">
<configuration name="process_redis" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="EventMonitor-master" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/process_redis.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
</configuration>
<recent_temporary>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="Python.process_redis" />
</list>
</recent_temporary>
</component>
<component name="ShelveChangesManager" show_recycled="false">
<option name="remove_strategy" value="false" />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
<created>1543216322930</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1543216322930</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="0" y="-7" width="1366" height="732" extended-state="7" />
<editor active="true" />
<layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.18887262" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.3296" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Data View" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
</layout>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<option name="time" value="1" />
</breakpoint-manager>
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/EventMonitor/crawl.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="90">
<caret line="5" column="1" lean-forward="false" selection-start-line="5" selection-start-column="1" selection-end-line="5" selection-end-column="1" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/rel_data.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="198">
<caret line="11" column="0" lean-forward="false" selection-start-line="11" selection-start-column="0" selection-end-line="11" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-380">
<caret line="44" column="137" lean-forward="false" selection-start-line="44" selection-start-column="137" selection-end-line="44" selection-end-column="137" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/handle_html.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-852">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/rel_data.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
<folding>
<element signature="e#194#203#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="90">
<caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/rel_data.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="288">
<caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
<folding>
<element signature="e#142#155#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/process_redis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="684">
<caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
<folding>
<element signature="e#14#26#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>

View File

Binary file not shown.

19
EventMonitor/EventMonitor/items.py Normal file
View File

@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class EventmonitorItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
keyword = scrapy.Field()
news_url = scrapy.Field()
news_time = scrapy.Field()
news_date = scrapy.Field()
news_title = scrapy.Field()
news_content = scrapy.Field()
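
A minimal usage sketch of the item above (the values are placeholders; scrapy Items support dict-style access):

from EventMonitor.items import EventmonitorItem

item = EventmonitorItem()
item['keyword'] = 'some keyword'
item['news_url'] = 'http://example.com/news/1.html'
item['news_time'] = '2018-11-28 17:23:37'
item['news_date'] = '2018-11-28'
item['news_title'] = 'example title'
item['news_content'] = 'example body text'
print(dict(item))  # convert to a plain dict, as the pipeline does before inserting into MongoDB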

103
EventMonitor/EventMonitor/middlewares.py Normal file
View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class EventmonitorSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class EventmonitorDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

46
EventMonitor/EventMonitor/pipelines.py Normal file
View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import pymongo
class EventmonitorPipeline(object):
def __init__(self):
CUR = '/'.join(os.path.abspath(__file__).split('/')[:-2])
self.news_path = os.path.join(CUR, 'news')
if not os.path.exists(self.news_path):
os.makedirs(self.news_path)
conn = pymongo.MongoClient('192.168.1.37', 27017)
self.col = conn['person_rel_dataset']['docs']
    '''Process a scraped news item and store it in MongoDB'''
    def process_item(self, item, spider):
        try:
            # insert_one() replaces the deprecated Collection.insert()
            self.col.insert_one(dict(item))
        except (pymongo.errors.WriteError, KeyError) as err:
            pass
            # raise DropItem("Duplicated Item: {}".format(item['name']))
        return item
    # '''Process the item stream'''
# def process_item(self, item, spider):
# print(item)
# keyword = item['keyword']
# event_path = os.path.join(self.news_path, keyword)
# if not os.path.exists(event_path):
# os.makedirs(event_path)
# filename = os.path.join(event_path, item['news_date'] + '' + item['news_title'])
# self.save_localfile(filename, item['news_title'], item['news_time'], item['news_content'])
# return item
#
    # '''Save the content to a local file'''
# def save_localfile(self, filename, title, pubtime, content):
# with open(filename, 'w+') as f:
# f.write('标题:{0}\n'.format(title))
# f.write('发布时间:{0}\n'.format(pubtime))
# f.write('正文:{0}\n'.format(content))
# f.close()
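
For reference, a hedged sketch of reading the stored items back out of the same MongoDB collection (host, database, and collection names as in the pipeline above):

import pymongo

conn = pymongo.MongoClient('192.168.1.37', 27017)
col = conn['person_rel_dataset']['docs']
print(col.count_documents({}))   # how many news documents were stored
for doc in col.find().limit(3):  # preview a few of them
    print(doc.get('news_date'), doc.get('news_title'))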

90
EventMonitor/EventMonitor/settings.py Normal file
View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for EventMonitor project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'EventMonitor'
SPIDER_MODULES = ['EventMonitor.spiders']
NEWSPIDER_MODULE = 'EventMonitor.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'EventMonitor (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'EventMonitor.middlewares.EventmonitorSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'EventMonitor.middlewares.EventmonitorDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'EventMonitor.pipelines.EventmonitorPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Binary file not shown.

4
EventMonitor/EventMonitor/spiders/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.

318
EventMonitor/EventMonitor/spiders/extract_news.py Normal file
View File

@@ -0,0 +1,318 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from collections import Counter
from operator import itemgetter
import copy
from lxml import etree
from .handle_html import *
from .utils import *
class NewsParser:
def __init__(self):
self.score = 6
self.length = 5
    def _cal_score(self, text):
        """Compute an interest score from the punctuation density of the text."""
        # full-width (Chinese) and ASCII punctuation are counted as evidence of running body text
        if ',' not in text:
            if ',' in text:
                return 0
            else:
                return -1
        else:
            score = text.count(',') + 1
            score += text.count(',') + 1
            score += text.count('。')
            score += text.count('!')
            return score
def _line_div(self, html):
html = re.sub("</?div>|</?table>", "</div><div>", html, flags=re.I)
html = html.replace('</div>', '', 1)
index = html.rfind('<div>')
html = html[:index] + html[index:].replace('<div>', '', 1)
return html
    def _line_p(self, text):
text = re.sub(r'</?p\s?.*?>', r'</p><p class="news_body">', text, flags=re.I | re.S)
text = text.replace('</p>', '', 1)
index = text.rfind('<p>')
text = text[:index] + text[index:].replace('<p>', '', 1)
text = '<p class="news_head">{0}</p>'.format(text)
return text
def _extract_paragraph(self, html):
cluster_para = {}
absorb_para = {}
for index, div_str in enumerate(re.findall("<div>(.*?)</div>", html, flags=re.S | re.I)):
if len(div_str.strip()) == 0:
continue
para_str = div_str.strip()
score = self._cal_score(para_str)
if score > self.score:
cluster_para[index] = [para_str, score]
else:
absorb_para[index] = [para_str, score]
return cluster_para, absorb_para
def _extract_feature(self, para_dict):
c = Counter()
index, text = max(para_dict.items(), key=lambda asd: asd[1][1])
feature_list = re.findall("(<p.*?>)", text[0], flags=re.I | re.S)
for feature in feature_list:
c[feature] += 1
if c.most_common(1):
feature, amount = c.most_common(1)[0]
else:
feature = ''
        feature = feature.replace('(', '\\(').replace(')', '\\)')
return index, feature
def _gen_skeleton(self, para_dict, index, feature):
""" 聚类段落集聚类生成生成正文脉络集合"""
skeleton_dict = {}
num_list = []
if not feature:
skeleton_dict[index] = para_dict[index]
return skeleton_dict
for num in para_dict.keys():
num_list.append(num)
num_list = sorted(num_list)
od = num_list.index(index)
f_list = num_list[0:od]
l_list = num_list[od:len(num_list)]
        # cluster toward later paragraphs
while l_list:
tmp = l_list.pop(0)
length = abs(tmp - index)
if length < self.length:
if re.match(r".*?{0}".format(feature), para_dict[tmp][0], flags=re.S | re.I):
skeleton_dict[tmp] = para_dict[tmp]
index = tmp
        # cluster toward earlier paragraphs
while f_list:
tmp = f_list.pop()
length = abs(index - tmp)
if length < self.length:
if re.match(r".*?{0}".format(feature), para_dict[tmp][0], flags=re.S):
skeleton_dict[tmp] = para_dict[tmp]
index = tmp
return skeleton_dict
def _absorb_text(self, skeleton_dict, para_dict):
"""从伪噪声段落吸收噪声段落"""
content_dict = skeleton_dict
sk_list = skeleton_dict.keys()
pa_list = para_dict.keys()
sk_list = sorted(sk_list)
pa_list = sorted(pa_list)
heads = []
middle = []
tail = []
for each in pa_list:
if each < sk_list[0]:
heads.append(each)
if each > sk_list[-1]:
tail.append(each)
if (each >= sk_list[0]) and (each <= sk_list[-1]):
middle.append(each)
while heads:
tmp = heads.pop()
index = sk_list[0]
if abs(tmp - index) < self.length:
if para_dict[tmp][1] * 2 > self.score:
content_dict[tmp] = para_dict[tmp]
else:
break
while tail:
tmp = tail.pop(0)
index = sk_list[-1]
if abs(tmp - index) < self.length:
if para_dict[tmp][1] * 2 > self.score:
content_dict[tmp] = para_dict[tmp]
else:
break
while middle:
tmp = middle.pop()
if para_dict[tmp][1] * 2 > self.score:
content_dict[tmp] = para_dict[tmp]
return content_dict
def _substring(self, text):
text = self._line_p(text)
text = pretty_html(text)
selector = etree.HTML(text)
xpath_result = selector.xpath('//p')
if len(xpath_result) == 1:
sub_string = xpath_result[0].xpath('string(.)')
sub_string = drop_mutil_br(sub_string)
else:
text_list = []
xpath_result = selector.xpath('//p[@class="news_body"]')
for item in xpath_result:
p_string = item.xpath('string(.)').strip()
if not p_string:
continue
p_string = drop_null(p_string)
text_list.append(p_string)
if text_list:
sub_string = '\n'.join(text_list)
else:
sub_string = ''
return sub_string
def _pretty_text(self, index_content_list):
contents = list()
for each in index_content_list:
sub_text = self._substring(each[1][0])
if not sub_text:
continue
else:
contents.append(sub_text)
text = "\n".join(contents)
return text
def extract_news(self, html):
html = handle_html(html)
html = self._line_div(html)
index = 0
cluster_para, absorb_para = self._extract_paragraph(html)
if cluster_para:
index, feature = self._extract_feature(cluster_para)
skeleton_dict = self._gen_skeleton(cluster_para, index, feature)
if skeleton_dict:
if absorb_para:
content_dict = self._absorb_text(skeleton_dict, absorb_para)
else:
content_dict = skeleton_dict
index_content_list = sorted(content_dict.items(), key=itemgetter(0))
top_div_list = list()
top_text = ''
index = index_content_list[0][0]
for ind, each_div in enumerate(re.findall("<div>(.*?)</div>", html, flags=re.S)):
if ind >= index:
break
top_text += each_div
top_div_list.append((ind, each_div))
else:
return
        '''Extract the article body'''
def extract_content():
text = ''
if index_content_list:
text = self._pretty_text(index_content_list)
text = text.strip()
return text
        '''Extract the publication time'''
def extract_pubtime():
pubtime = ''
tmp_top_div_list = copy.deepcopy(top_div_list)
while tmp_top_div_list:
ind, item = tmp_top_div_list.pop()
if not item.strip():
continue
div_selector = etree.HTML(item)
if div_selector is None:
continue
div_text = div_selector.xpath('string(.)').strip()
if not div_text:
continue
pubtime = re.search(r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-/]\s*\d{1,2}\s*[\-_:日]?\s*\d{1,2}\s*:\s*\d{1,2}\s*(:\s*\d{1,2})?', div_text, flags=re.S|re.I)
if pubtime:
pubtime = pubtime.group()
index = ind
break
if not pubtime:
tmp_top_div_list = copy.deepcopy(top_div_list)
while tmp_top_div_list:
ind, item = tmp_top_div_list.pop()
if not item.strip():
continue
div_selector = etree.HTML(item)
if div_selector is None:
continue
div_text = div_selector.xpath('string(.)')
pubtime = re.search(r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-/]\s*\d{1,2}\s*[\-_:日/]?', div_text,
flags=re.S)
if pubtime:
pubtime = pubtime.group()
index = ind
break
if pubtime:
pubtime = pubtime.strip()
                pubtime = pubtime.replace('年', '-').replace('月', '-').replace('日', ' ').replace('/', '-')
pubtime = drop_mutil_blank(pubtime)
return pubtime, index
else:
return pubtime, 0
        '''Extract the title'''
def extract_title():
title = ''
selector = etree.HTML(html)
tmps = selector.xpath('//title/text()')
if tmps:
title = tmps[0].strip()
title = clear_title(title)
return title
news = {}
news_content = extract_content()
news_pubtime, index = extract_pubtime()
news_title = extract_title()
news['news_content'] = news_content
news['news_pubtime'] = self.pretty_time(news_pubtime)
if news['news_pubtime']:
news['news_date'] = news['news_pubtime'].split(' ')[0]
else:
news['news_date'] = ''
news['news_title'] = news_title
if not (news['news_content'] and news['news_pubtime'] and news['news_title'] and news['news_date']):
return {}
return news
    '''Normalize a time string'''
def pretty_time(self, time):
if not time:
return None
modify_time = time
if len(time.split(' ')) == 2:
date = modify_time.split(' ')[0]
hour = modify_time.split(' ')[1]
date_new = self.pretty_date(date)
modify_time = ' '.join([date_new, hour])
else:
date = modify_time.split(' ')[0]
modify_time = self.pretty_date(date)
return modify_time
    '''Normalize a year-month-day date string'''
def pretty_date(self, date):
date = date.split('-')
if len(date) != 3:
return ''
year = date[0]
month = date[1]
day = date[2]
if int(month) < 10 and len(month) == 1:
month = '0' + month
if int(day) < 10 and len(day) == 1:
day = '0' + day
date_new = '-'.join([year, month, day])
return date_new
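
A standalone usage sketch of the extractor above, assuming the module is importable from the spiders package (the URL is a placeholder):

import urllib.request
from EventMonitor.spiders.extract_news import NewsParser

html = urllib.request.urlopen('http://example.com/news.html').read().decode('utf-8', 'ignore')
news = NewsParser().extract_news(html)
if news:  # falsy when the body, publication time, or title could not be extracted
    print(news['news_pubtime'], news['news_title'])
    print(news['news_content'][:200])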

Binary file not shown.

74
EventMonitor/EventMonitor/spiders/handle_html.py Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0
from html.parser import HTMLParser
from bs4 import BeautifulSoup
class StripParser(HTMLParser):
    """
    Strip specific noise tags from the HTML.
    """
def __init__(self):
self.reset()
self.strict = False
self.convert_charrefs = True
self.drop_tags = {'script', 'style', 'iframe', 'aside', 'nav', 'footer'}
self.fed = []
        self.point_tags = []
self.is_fed = True
def handle_starttag(self, tag, attrs):
if tag in self.drop_tags:
self.is_fed = False
self.point_tags.append(tag)
else:
if tag == 'p':
tmp_attrs = ['{0}="{1}"'.format(i[0], i[1]) for i in attrs]
tmp_attrs = ' '.join(tmp_attrs)
self.fed.append('<p {}>'.format(tmp_attrs))
else:
self.fed.append('<{}>'.format(tag))
def handle_data(self, data):
if self.is_fed:
self.fed.append(data)
    def handle_endtag(self, tag):
        if tag in self.drop_tags:
            # guard against a stray close tag with no matching open tag
            if self.point_tags and tag == self.point_tags[-1]:
                self.point_tags.pop()
            if not self.point_tags:
                self.is_fed = True
        else:
            self.fed.append('</{}>'.format(tag))
def get_html(self):
return '\n'.join(self.fed)
def pretty_html(html):
soup = BeautifulSoup(html, 'html.parser')
fixed_html = soup.prettify()
return fixed_html
def strip_tag(html):
    """
    Strip specific noise tags from the HTML
    :param html: string
    :return: string
    """
s = StripParser()
s.feed(html)
return s.get_html()
def handle_html(html):
html = pretty_html(html)
html = strip_tag(html)
return html
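
A quick sketch of the helpers above on an inline snippet (the HTML is made up):

raw = ('<html><head><script>var x = 1;</script></head>'
       '<body><p class="a">First paragraph.</p><footer>site footer</footer></body></html>')
print(handle_html(raw))  # prettified HTML with script/style/iframe/aside/nav/footer stripped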

Binary file not shown.

86
EventMonitor/EventMonitor/spiders/news_spider.py Normal file
View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# coding: utf-8
# File: news_spider.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-7-15
import os
import redis
import scrapy
from lxml import etree
from urllib.parse import quote_plus
from .extract_news import *
from EventMonitor.items import EventmonitorItem
class BuildData:
def __init__(self):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.rel_filepath = os.path.join(cur, 'rel_data.txt')
self.seed_rels = self.collect_rels()
return
    '''Load the seed relation dataset'''
def collect_rels(self):
rels_data = []
for line in open(self.rel_filepath):
line = line.strip().split('###')
keywords = line[:-2]
rels_data.append(keywords)
return rels_data
class NewsSpider(scrapy.Spider):
name = 'eventspider'
def __init__(self):
self.seed_rels = BuildData().seed_rels
self.parser = NewsParser()
self.pool = redis.ConnectionPool(host='192.168.1.29', port=6379, decode_responses=True)
self.conn = redis.Redis(connection_pool=self.pool)
self.redis_key = 'person_names'
    '''Main entry point: pop seeds from Redis and issue search requests'''
def start_requests(self):
        while True:
            res = self.conn.spop(self.redis_key)
            print(res)
            if res is None:
                return
line = res.strip().split('###')
keywords = line[:-1]
search_body = '+'.join([quote_plus(wd) for wd in keywords[:-1]])
seed_urls = []
for page in range(0, 101, 20):
url = 'https://www.baidu.com/s?ie=utf-8&cl=2&rtt=1&bsst=1&tn=news&word=' + search_body + '&tngroupname=organic_news&pn=' + str(
page)
seed_urls.append(url)
for seed_url in seed_urls:
param = {'url': seed_url,
'keyword': ' '.join(keywords)}
yield scrapy.Request(url=seed_url, meta=param, callback=self.collect_newslist, dont_filter=True)
    '''Collect the list of news links from a result page'''
def collect_newslist(self, response):
selector = etree.HTML(response.text)
news_links = selector.xpath('//h3[@class="c-title"]/a/@href')
print(response.meta['keyword'], len(set(news_links)))
for news_link in news_links:
param = {'url': news_link,
'keyword': response.meta['keyword']}
yield scrapy.Request(url=news_link, meta=param, callback=self.page_parser, dont_filter=True)
    '''Extract structured fields from a news page'''
def page_parser(self, response):
data = self.parser.extract_news(response.text)
if data:
item = EventmonitorItem()
item['keyword'] = response.meta['keyword']
item['news_url'] = response.meta['url']
item['news_time'] = data['news_pubtime']
item['news_date'] = data['news_date']
item['news_title'] = data['news_title']
item['news_content'] = data['news_content']
yield item
return
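
The README mentions running the project with python crawl.py; that file is not part of this diff, but a minimal runner could look like this sketch using Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('eventspider')  # the name defined on NewsSpider
process.start()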

File diff suppressed because it is too large

123
EventMonitor/EventMonitor/spiders/utils.py Normal file
View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0
from collections import Counter
import jieba.posseg as pseg
import re
def is_chinese(uchar):
"""判断一个unicode是否是汉字"""
if uchar >= '\u4e00' and uchar <= '\u9fa5':
return True
else:
return False
def is_number(uchar):
"""判断一个unicode是否是数字"""
if uchar >= u'\u0030' and uchar <= u'\u0039':
return True
else:
return False
def is_alphabet(uchar):
"""判断一个unicode是否是英文字母"""
if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
return True
else:
return False
def is_legal(uchar):
"""判断是否非汉字,数字和英文字符"""
if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
return False
else:
return True
def count_pos(str):
"""返回词性个数"""
pos_set = set()
words = pseg.cut(str)
for word, flag in words:
pos_set.add(flag)
return len(pos_set)
def is_longsent(str):
"""根据字符串汉字长度判断是否是标题"""
length = 0
for uchar in str:
if is_chinese(uchar):
length += 1
else:
pass
if length > 8:
return True
else:
return False
def clear_title(title_str):
    seg_set = set(['\\', '|', '/', '_'])
c = Counter()
for item in title_str:
if item in seg_set:
c[item] += 1
if c.most_common(1):
seg, count = c.most_common(1)[0]
else:
seg, count = '', 0
if seg:
title = title_str.split(seg)[0]
else:
title = title_str
title = title.replace('——', '-')
tmp = title.split('-')
is_continue = True
while is_continue:
if len(tmp) > 1:
top = tmp[-1]
pos_num = count_pos(top)
if pos_num > 2:
is_continue = False
else:
tmp.pop()
else:
is_continue = False
title = '-'.join(tmp).replace("\t", '')
return title
def clear_pan(str):
num = str.count('>')
if num >= 2:
return str.split('>')[-1]
else:
return str
def drop_null(arg):
if isinstance(arg, str):
        arg = re.sub(r'\s', '', arg, flags=re.S)
return arg
elif isinstance(arg, list):
new_list = []
for i in arg:
i = i.strip()
if i:
new_list.append(i)
else:
continue
return new_list
else:
return arg
def drop_mutil_br(str):
str = re.sub(r'<br>|</br>', '\n', str)
str = re.sub(r'\n\s+', '\n', str)
return str
def drop_mutil_blank(str):
str = re.sub(r'\s{2,}', ' ', str)
return str
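
A few quick checks of the helpers above (expected outputs in the comments):

print(is_chinese('中'), is_chinese('a'))  # True False
print(clear_pan('首页>新闻>正文'))          # 正文
print(drop_null(' a b\nc '))              # abc
print(drop_mutil_blank('a    b  c'))      # a b c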

31
EventMonitor/README.md Normal file
View File

@@ -0,0 +1,31 @@
# EventMonitor
Event monitor based on an online news corpus built via the Baidu search engine: given event keywords, it collects event news for event storyline mining and analysis.
# Project roadmap
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/project.png)
# Project breakdown
# 1) Collecting a diachronic topic corpus from topic keywords
How to run: open a cmd window in the EventMonitor directory and execute "scrapy crawl eventspider -a keyword=<topic keyword>", or simply run python crawl.py. After a few seconds the corresponding news files are stored in the news folder, which yields the topic set and the historical texts of the event.
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/topic.png)
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/news.png)
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/content.png)
# 2) Sentiment analysis of trending events
The historical corpus from 1) can be analyzed with a document-level sentiment analysis algorithm based on dependency semantics and a sentiment lexicon.
See my document-level sentiment analysis project DocSentimentAnalysis: https://github.com/liuhuanyong/DocSentimentAnalysis
# 3) Search trends of trending events
For the historical corpus from 1), Baidu Index and Sina Weibo Index data can be collected.
See my Baidu Index collector BaiduIndexSpyder: https://github.com/liuhuanyong/BaiduIndexSpyder
and my Weibo Index collector WeiboIndexSpyder: https://github.com/liuhuanyong/WeiboIndexSpyder
# 4) Topic analysis of trending events
The historical corpus from 1) can be analyzed with LDA or K-means topic models.
See my topic analysis project TopicCluster: https://github.com/liuhuanyong/TopicCluster
# 5) Representative-text analysis of trending events
The historical corpus from 1) can be ranked for importance with a cross-document TextRank algorithm.
See my document importance project ImportantEventExtractor: https://github.com/liuhuanyong/ImportantEventExtractor
# 6) Graph-style presentation of event news texts
Each historical news text can be presented visually using keyword extraction, entity recognition, and other relation extraction methods.
See my text graphing project TextGrapher: https://github.com/liuhuanyong/TextGrapher
# Closing remarks
There are many methods for event monitoring and many problems still to solve; the approach above is only one attempt, and the algorithms themselves leave much room for improvement.
If any questions about the project or me, see https://liuhuanyong.github.io/

5
EventMonitor/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
#!/usr/bin/env python3
# coding: utf-8
# File: __init__.py.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-7-15

40
EventMonitor/process_redis.py Normal file
View File

@@ -0,0 +1,40 @@
#coding=utf-8
import redis
import os
class RedisProcess:
def __init__(self):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.pool = redis.ConnectionPool(host='192.168.1.37', port=6379, decode_responses=True)
self.conn = redis.Redis(connection_pool=self.pool)
self.rel_filepath = os.path.join(cur, 'rel_data.txt')
return
def insert_data(self):
name = 'person_names'
i = 0
for line in open(self.rel_filepath):
i += 1
if i < 833:
continue
line = line.strip()
if not line or len(line.split('###')) != 4:
continue
self.conn.sadd(name, line)
print(i)
return
def read_data(self):
name = 'person_names'
res = 1
        while res:
res = self.conn.spop(name)
print(res)
return
if __name__ == '__main__':
handler = RedisProcess()
handler.insert_data()
# handler.read_data()
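
A small sketch for inspecting the seed queue without consuming it (same key and connection settings as above; unlike spop, scard and srandmember leave the set untouched):

import redis

conn = redis.Redis(host='192.168.1.37', port=6379, decode_responses=True)
print(conn.scard('person_names'))        # seeds still queued
print(conn.srandmember('person_names'))  # peek at one seed without removing it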

35996
EventMonitor/rel_data.txt Normal file

File diff suppressed because it is too large

11
EventMonitor/scrapy.cfg Normal file
View File

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = EventMonitor.settings
[deploy]
#url = http://localhost:6800/
project = EventMonitor