增加代理、延迟功能

This commit is contained in:
mxd 2019-07-24 11:07:56 +08:00
parent 13ea2099c4
commit d7797875e4
9 changed files with 263 additions and 141 deletions

View File

@ -1,16 +1,14 @@
package com.mxd.spider.core;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@ -39,46 +37,61 @@ public class Spider {
/**
 * Runs a spider flow: parses the flow's XML into a node tree and executes it
 * starting at the root node with a fresh context and an empty variable map.
 *
 * @param spiderFlow flow whose XML is parsed with SpiderFlowUtils.loadXMLFromString
 */
public void run(SpiderFlow spiderFlow){
SpiderNode root = SpiderFlowUtils.loadXMLFromString(spiderFlow.getXml());
SpiderContext context = new SpiderContext();
int threadPoolSize = 8;
//Fixed-size pool of threadPoolSize threads with a 5000 ms keep-alive and an unbounded task queue
ThreadPoolExecutor pool = new ThreadPoolExecutor(threadPoolSize,threadPoolSize,5000,TimeUnit.MILLISECONDS,new LinkedBlockingQueue<Runnable>());
execute(pool,root, context,new HashMap<>());
//8 is the worker-thread count passed to execute(int, ...)
execute(8,root, context,new HashMap<>());
}
/**
 * Test entry point: executes the given node tree with the supplied context and
 * an empty variable map, logs a completion message to the context and returns
 * the outputs collected in it.
 *
 * @param root root node of the flow to execute
 * @param context context that receives log messages and outputs
 * @return the value of context.getOutputs() after execution
 */
public List<SpiderOutput> runWithTest(SpiderNode root,SpiderContext context){
//Nothing may be configured at the start
int threadPoolSize = 8;
//Fixed-size pool of threadPoolSize threads with a 5000 ms keep-alive and an unbounded task queue
ThreadPoolExecutor pool = new ThreadPoolExecutor(threadPoolSize,threadPoolSize,5000,TimeUnit.MILLISECONDS,new LinkedBlockingQueue<Runnable>());
execute(pool,root, context,new HashMap<>());
//8 is the worker-thread count passed to execute(int, ...)
execute(8,root, context,new HashMap<>());
context.log("测试完毕!");
return context.getOutputs();
}
private void execute(ExecutorService threadPool,SpiderNode node,SpiderContext context,Map<String,Object> variables){
/**
 * Executes the flow starting at {@code node} with {@code nThreads} worker threads.
 *
 * <p>Tasks are handed to the workers through a bounded queue of capacity
 * {@code nThreads}; {@code executeNode} fills the queue and each worker drains it.
 *
 * <p>NOTE(review): {@code LinkedBlockingQueue.take()} blocks until an element is
 * available and never returns {@code null}, so the workers never leave their loop
 * on their own and the {@code awaitTermination} loop below only ends when the
 * calling thread is interrupted. Completing normally needs a "no more work" signal
 * shared with {@code executeNode}, which is outside this method.
 *
 * @param nThreads number of worker threads and capacity of the hand-off queue
 * @param node node at which execution starts
 * @param context context passed through to every executed node
 * @param variables variables visible to the starting node
 */
private void execute(int nThreads,SpiderNode node,SpiderContext context,Map<String,Object> variables){
    // The core size must equal nThreads: with a core size of 0 and an unbounded
    // work queue a ThreadPoolExecutor starts a single thread and queues everything
    // else, so only one of the long-running workers below would ever run.
    ThreadPoolExecutor pool = new ThreadPoolExecutor(nThreads, nThreads, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
    LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>(nThreads);
    // Start the consumers before producing: the queue is bounded, so seeding it
    // first would block the caller forever once more than nThreads tasks are put.
    for (int i = 0; i < nThreads; i++) {
        pool.execute(()->{
            try {
                while(true){
                    queue.take().run();
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
                // Keep the interrupt visible to the pool after leaving the loop.
                Thread.currentThread().interrupt();
            }
        });
    }
    try {
        executeNode(queue,node,context,variables);
    } catch (RuntimeException e) {
        // Interrupt the idle workers so they do not leak when seeding fails.
        pool.shutdownNow();
        throw e;
    }
    pool.shutdown();
    // Wait for the worker threads to finish
    try {
        while(!pool.awaitTermination(60, TimeUnit.SECONDS)){
        }
    } catch (InterruptedException e) {
        e.printStackTrace();
        Thread.currentThread().interrupt();
    }
}
/**
 * Schedules every successor of {@code node} by calling {@code executeNode} on it
 * with the same queue, context and variables. Does nothing when the node has no
 * successor list.
 *
 * @param queue hand-off queue passed through to {@code executeNode}
 * @param node node whose successors are scheduled
 * @param context context passed through to {@code executeNode}
 * @param variables variables passed through to {@code executeNode}
 */
private void executeaNextNodes(LinkedBlockingQueue<Runnable> queue,SpiderNode node,SpiderContext context,Map<String,Object> variables){
    List<SpiderNode> successors = node.getNextNodes();
    if(successors == null){
        return;
    }
    for (SpiderNode successor : successors) {
        executeNode(queue, successor, context, variables);
    }
}
private void executeNode(LinkedBlockingQueue<Runnable> queue,SpiderNode node,SpiderContext context,Map<String,Object> variables){
if(!executeCondition(node,context,variables)){
return;
}
if(logger.isDebugEnabled()){
logger.debug("执行节点[{}:{}]",node.getNodeName(),node.getNodeId());
}
context.log(String.format("执行节点[%s:%s]", node.getNodeName(),node.getNodeId()));
if(executeCondition(node,context,variables)){
executeNode(threadPool,node,context,variables);
if(logger.isDebugEnabled()){
logger.debug("执行节点[{}:{}]完毕",node.getNodeName(),node.getNodeId());
}
context.log(String.format("执行节点[%s:%s]完毕", node.getNodeName(),node.getNodeId()));
}
}
/**
 * Calls execute on every successor of the given node, passing the same thread
 * pool, context and variables. Does nothing when getNextNodes() returns null.
 *
 * @param threadPool executor passed through to execute
 * @param node node whose successors are executed
 * @param context context passed through to execute
 * @param variables variables passed through to execute
 */
private void executeaNextNodes(ExecutorService threadPool,SpiderNode node,SpiderContext context,Map<String,Object> variables){
List<SpiderNode> nextNodes = node.getNextNodes();
if(nextNodes != null){
for (SpiderNode nextNode : nextNodes) {
execute(threadPool,nextNode,context,variables);
}
}
}
private void executeNode(ExecutorService pool,SpiderNode node,SpiderContext context,Map<String,Object> variables){
int loopCount = 1;
if(node.getLoopCount() != null){
if(StringUtils.isNotEmpty(node.getLoopCount())){
Object result = engine.execute(node.getLoopCount(), variables);
if(result != null){
if(logger.isDebugEnabled()){
@ -88,30 +101,36 @@ public class Spider {
loopCount = ((Long)result).intValue();
}
}
List<Future<Map<String,Object>>> futures = new ArrayList<>();
if(loopCount > 0){
for (Executor executor : executors) {
if(executor.supportShape().equals(node.getShape())){
for (int i = 0; i < loopCount; i++) {
//存入下标变量
Map<String, Object> nVariables = Maps.add(variables, node.getLoopVariableName(), i);
futures.add(pool.submit(()->{
executor.execute(node, context,nVariables);
return nVariables;
}));
try {
queue.put(()->{
try {
executor.execute(node, context,nVariables);
} catch (Exception e) {
logger.error("执行节点[{}:{}]出错",node.getNodeName(),node.getNodeId(),e);
} finally{
if(logger.isDebugEnabled()){
logger.debug("执行节点[{}:{}]完毕",node.getNodeName(),node.getNodeId());
}
context.log(String.format("执行节点[%s:%s]完毕", node.getNodeName(),node.getNodeId()));
//递归执行下一级
CompletableFuture.runAsync(()->{
executeaNextNodes(queue, node, context, nVariables);
});
}
});
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
//递归执行下一级
for (Future<Map<String,Object>> future : futures) {
try {
executeaNextNodes(pool, node, context, future.get());
} catch (InterruptedException | ExecutionException e) {
}
}
}
private boolean executeCondition(SpiderNode node,SpiderContext context,Map<String,Object> variables){

View File

@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -34,6 +35,16 @@ public class RequestExecutor implements Executor{
@Override
public void execute(SpiderNode node, SpiderContext context, Map<String,Object> variables) {
// Optional delay before the request: the node's "sleep" property is an expression
// whose numeric result is the number of milliseconds to pause.
String sleepCondition = node.getJsonProperty() == null ? null : node.getJsonProperty().getSleep();
if(StringUtils.isNotEmpty(sleepCondition)){
    try {
        Object value = engine.execute(sleepCondition, variables);
        // Cast to Number rather than Long so an Integer (or other numeric) result
        // is accepted; a non-numeric result still fails loudly as before.
        long sleepTime = ((Number)value).longValue();
        Thread.sleep(sleepTime);
    } catch (InterruptedException e) {
        // Do not swallow the interrupt: restore the flag for the calling code.
        Thread.currentThread().interrupt();
    }
}
HttpRequest request = HttpRequest.create();
SpiderJsonProperty property = node.getJsonProperty();
List<SpiderNameValue> parameters = property.getParameters();
@ -93,6 +104,24 @@ public class RequestExecutor implements Executor{
request.header(nameValue.getName(),value);
}
}
//Configure the proxy
String proxy = property.getProxy();
if(proxy != null){
try {
//The configured value is evaluated as an expression; its string form is then split on ":" into host and port
proxy = engine.execute(proxy, variables).toString();
String[] proxyArr = proxy.split(":");
//A value that does not split into exactly two parts is ignored without any log entry
if(proxyArr != null && proxyArr.length == 2){
request.proxy(proxyArr[0], Integer.parseInt(proxyArr[1]));
context.log(String.format("设置代理:%s", proxy));
if(logger.isDebugEnabled()){
logger.debug("设置代理:{}",proxy);
}
}
} catch (Exception e) {
//Any failure (evaluation error, non-numeric port, ...) is reported to the context log and the logger; the request then proceeds without a proxy
context.log("设置代理出错,异常信息:" + ExceptionUtils.getStackTrace(e));
logger.error("设置代理出错",e);
}
}
try {
HttpResponse response = request.execute();
//结果存入变量
@ -104,5 +133,5 @@ public class RequestExecutor implements Executor{
}
}
}

View File

@ -1,13 +1,15 @@
package com.mxd.spider.core.io;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
public class HttpRequest {
@ -19,6 +21,13 @@ public class HttpRequest {
private String method = "GET";
private Proxy proxy;
/**
* 超时时间
*/
private int timeout = 60000;
/**
 * Static factory: returns a new HttpRequest instance.
 */
public static HttpRequest create(){
return new HttpRequest();
}
@ -46,7 +55,7 @@ public class HttpRequest {
/**
 * Sets a header from an arbitrary object by passing the object's string form to
 * the header(key, String) overload. A null value is ignored.
 *
 * @param key header name
 * @param value header value; ignored when null
 * @return the result of the String overload, or this request when value is null
 */
public HttpRequest header(String key,Object value){
if(value != null){
return header(key,data.toString());
return header(key,value.toString());
}
return this;
}
@ -78,11 +87,18 @@ public class HttpRequest {
return this;
}
/**
 * Routes this request through an HTTP proxy at the given address. The host name
 * is stored unresolved, so no DNS lookup happens here.
 *
 * @param host proxy host name or IP address
 * @param port proxy port
 * @return this request, for chaining
 */
public HttpRequest proxy(String host,int port){
    InetSocketAddress endpoint = InetSocketAddress.createUnresolved(host, port);
    this.proxy = new Proxy(Proxy.Type.HTTP, endpoint);
    return this;
}
public HttpResponse execute() throws IOException{
Connection connection = Jsoup.connect(this.url);
connection.ignoreContentType(true);
connection.ignoreHttpErrors(true);
connection.method(Method.GET);
connection.maxBodySize(0);
connection.timeout(this.timeout);
if("POST".equals(this.method)){
connection.method(Method.POST);
}
@ -92,6 +108,9 @@ public class HttpRequest {
if(this.data != null){
connection.data(data);
}
if(this.proxy != null){
connection.proxy(proxy);
}
Response response = connection.execute();
return new HttpResponse(response);
}

View File

@ -10,6 +10,8 @@ public class SpiderJsonProperty{
private List<SpiderNameValue> variables;
private List<SpiderNameValue> outputs;
private String sleep;
/**
* 条件判断表达式
@ -36,6 +38,11 @@ public class SpiderJsonProperty{
private List<SpiderNameValue> parameters;
/**
* 代理
*/
private String proxy;
/*爬取参数--end*/
/*数据源参数--start*/
@ -196,14 +203,31 @@ public class SpiderJsonProperty{
public void setLoopVariableName(String loopVariableName) {
this.loopVariableName = loopVariableName;
}
/**
 * Returns the proxy setting of this node.
 */
public String getProxy() {
return proxy;
}
/**
 * Sets the proxy setting of this node.
 *
 * @param proxy the new proxy value
 */
public void setProxy(String proxy) {
this.proxy = proxy;
}
/**
 * Returns the delay setting of this node.
 */
public String getSleep() {
return sleep;
}
/**
 * Sets the delay setting of this node.
 *
 * @param sleep the new delay value
 */
public void setSleep(String sleep) {
this.sleep = sleep;
}
/**
 * Returns a single-line listing of this object's fields in the form
 * "SpiderJsonProperty [name=value, ...]".
 */
@Override
public String toString() {
return "SpiderJsonProperty [shape=" + shape + ", variables=" + variables + ", outputs=" + outputs
+ ", condition=" + condition + ", loopCount=" + loopCount + ", loopVariableName=" + loopVariableName
+ ", url=" + url + ", method=" + method + ", headers=" + headers + ", parameters=" + parameters
+ ", datasourceType=" + datasourceType + ", datasourceUrl=" + datasourceUrl + ", datasourceUsername="
+ datasourceUsername + ", datasourcePassword=" + datasourcePassword + ", datasourceId=" + datasourceId
+ ", statementType=" + statementType + ", sql=" + sql + "]";
return "SpiderJsonProperty [shape=" + shape + ", variables=" + variables + ", outputs=" + outputs + ", sleep="
+ sleep + ", condition=" + condition + ", loopCount=" + loopCount + ", loopVariableName="
+ loopVariableName + ", url=" + url + ", method=" + method + ", headers=" + headers + ", parameters="
+ parameters + ", proxy=" + proxy + ", datasourceType=" + datasourceType + ", datasourceUrl="
+ datasourceUrl + ", datasourceUsername=" + datasourceUsername + ", datasourcePassword="
+ datasourcePassword + ", datasourceId=" + datasourceId + ", statementType=" + statementType + ", sql="
+ sql + "]";
}
}

View File

@ -2,7 +2,7 @@ server.port=8088
logging.level.root=INFO
spider.job.enable=true
spider.job.enable=false
spring.jackson.date-format=yyyy-MM-dd HH:mm:ss
spring.jackson.time-zone=GMT+8

View File

@ -17,6 +17,7 @@
<div class="toolbar-container">
<ul>
<li class="btn-return">返回</li>
<li class="btn-selectAll" title="Ctrl+A">全选</li>
<li class="btn-save" title="Ctrl+S">保存</li>
<li class="btn-test" title="Ctrl+Q">测试</li>
<li class="btn-console-xml">打印XML</li>

View File

@ -99,7 +99,18 @@ $(function(){
processCellEvent(graph.getSelectionCell(),graph);
});
//节点名称输入框事件
$("body").on("keyup",".editor-form-node .layui-form-item input[name=value]",function(){
$("body").on("mousewheel",".layui-tab .layui-tab-title",function(e,delta){
var $dom = $(this);
var wheel = e.originalEvent.wheelDelta || -e.originalEvent.detail;
var delta = Math.max(-1, Math.min(1, wheel) );
e.preventDefault = function(){}
if(delta > 0){
$dom.scrollLeft($dom.scrollLeft()-60);
}else{
$dom.scrollLeft($dom.scrollLeft()+60);
}
return false;
}).on("keyup",".editor-form-node .layui-form-item input[name=value]",function(){
var cell = graph.getSelectionCell();
if(cell != null){
var $input = $(this);
@ -117,17 +128,11 @@ $(function(){
}
}
}
}).on("keyup",".editor-form-node .layui-form-item input[name=spiderName]",function(){
}).on("keyup",".editor-form-node .layui-form-item input.input-default",function(){
var cell = graph.getModel().getRoot();
if(cell != null){
cell.data = cell.data || new JsonProperty();
cell.data.set('spiderName',$(this).val())
}
}).on("keyup",".editor-form-node .layui-form-item input[name=loopCount]",function(){
var cell = graph.getSelectionCell();
if(cell != null){
cell.data = cell.data || new JsonProperty();
cell.data.set('loopCount',$(this).val())
cell.data.set($(this).attr('name'),$(this).val())
}
}).on("keyup",".editor-form-node .layui-form-item input[name^=variable-]",function(){ //变量操作
resetFormArray(graph,'variable','variables');
@ -438,6 +443,8 @@ $(function(){
var graph = editor.graph;
$(".toolbar-container").on('click','.btn-delete',function(){
deleteSelectCells(graph);
}).on("click",".btn-selectAll",function(){
editor.execute('selectAll');
}).on('click',".btn-undo",function(){
editor.execute('undo');
}).on('click',".btn-redo",function(){
@ -521,7 +528,7 @@ $(function(){
location.href="spiderList.html"
}).on('click','.btn-save',function(){
Save();
});
})
}
function getXML(editor){
@ -556,7 +563,7 @@ $(function(){
editor.execute('copy');
});
keyHandler.bindControlKey(86,function(){ // Ctrl+V
editor.execute('paste')
editor.execute('paste');
});
keyHandler.bindControlKey(83,function(){ // Ctrl+S
Save();
@ -564,6 +571,9 @@ $(function(){
keyHandler.bindControlKey(81,function(){ // Ctrl+Q: trigger the test button
$(".btn-test").click();
});
keyHandler.bindControlKey(65,function(){ // Ctrl+A
editor.execute('selectAll');
});
}
function createWebSocket(options){

View File

@ -1,10 +1,13 @@
<div class="layui-tab layui-tab-fixed layui-tab-brief">
<ul class="layui-tab-title">
<li class="layui-this">配置</li>
<li class="layui-this">基本配置</li>
<li>参数</li>
<li>Header</li>
<li>代理</li>
</ul>
<div class="layui-tab-content">
<div class="layui-tab-content editor-form-node">
<div class="layui-tab-item layui-show">
<div class="layui-form editor-form-node">
<div class="layui-form">
<div class="layui-form-item">
<label class="layui-form-label">节点名称</label>
<div class="layui-input-block">
@ -14,19 +17,25 @@
<div class="layui-form-item">
<label class="layui-form-label">循环变量</label>
<div class="layui-input-block">
<input type="text" name="loopVariableName" placeholder="请输入循环变量" autocomplete="off" class="layui-input" value="{{(d.data.object.loopVariableName || '').replace('"','&quot;')}}">
<input type="text" name="loopVariableName" placeholder="请输入循环变量" autocomplete="off" class="layui-input input-default" value="{{(d.data.object.loopVariableName || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">循环次数</label>
<div class="layui-input-block">
<input type="text" name="loopCount" placeholder="请输入循环次数" autocomplete="off" class="layui-input" value="{{(d.data.object.loopCount || '').replace('"','&quot;')}}">
<input type="text" name="loopCount" placeholder="请输入循环次数" autocomplete="off" class="layui-input input-default" value="{{(d.data.object.loopCount || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">起始URL</label>
<label class="layui-form-label">延迟时间</label>
<div class="layui-input-block">
<input type="text" name="url" placeholder="请输入起始url" autocomplete="off" class="layui-input" value="{{(d.data.object.url || '').replace('"','&quot;')}}">
<input type="text" name="sleep" placeholder="请输入延迟时间" autocomplete="off" class="layui-input input-default" value="{{(d.data.object.sleep || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">URL</label>
<div class="layui-input-block">
<input type="text" name="url" placeholder="请输入起始url" autocomplete="off" class="layui-input input-default" value="{{(d.data.object.url || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
@ -36,52 +45,61 @@
<input type="radio" name="method" value="POST" title="POST" {{d.data.object.method == 'POST' ? 'checked': ''}}>
</div>
</div>
{{# layui.each(d.data.object.parameters,function(index,parameter){ }}
<hr>
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close parameter-remove"></i>
<label class="layui-form-label">参数名</label>
<div class="layui-input-block">
<input type="text" name="parameter-name" placeholder="请输入参数名" autocomplete="off" class="layui-input" value="{{(parameter.name || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">参数值</label>
<div class="layui-input-block">
<input type="text" name="parameter-value" placeholder="请输入参数值" autocomplete="off" class="layui-input" value="{{(parameter.value || '').replace('"','&quot;')}}">
</div>
</div>
{{# }) }}
<hr>
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn parameter-add">添加一个参数</button>
</div>
</div>
{{# layui.each(d.data.object.headers,function(index,header){ }}
<hr>
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close header-remove"></i>
<label class="layui-form-label">header名</label>
<div class="layui-input-block">
<input type="text" name="header-name" placeholder="请输入header名" autocomplete="off" class="layui-input" value="{{(header.name || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">header值</label>
<div class="layui-input-block">
<input type="text" name="header-value" placeholder="请输入header值" autocomplete="off" class="layui-input" value="{{(header.value || '').replace('"','&quot;')}}">
</div>
</div>
{{# }) }}
<hr>
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn header-add">添加一个Header</button>
</div>
</div>
</div>
</div>
<div class="layui-tab-item">
{{# layui.each(d.data.object.parameters,function(index,parameter){ }}
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close parameter-remove"></i>
<label class="layui-form-label">参数名</label>
<div class="layui-input-block">
<input type="text" name="parameter-name" placeholder="请输入参数名" autocomplete="off" class="layui-input" value="{{(parameter.name || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">参数值</label>
<div class="layui-input-block">
<input type="text" name="parameter-value" placeholder="请输入参数值" autocomplete="off" class="layui-input" value="{{(parameter.value || '').replace('"','&quot;')}}">
</div>
</div>
<hr>
{{# }) }}
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn parameter-add">添加一个参数</button>
</div>
</div>
</div>
<div class="layui-tab-item">
{{# layui.each(d.data.object.headers,function(index,header){ }}
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close header-remove"></i>
<label class="layui-form-label">header名</label>
<div class="layui-input-block">
<input type="text" name="header-name" placeholder="请输入header名" autocomplete="off" class="layui-input" value="{{(header.name || '').replace('"','&quot;')}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">header值</label>
<div class="layui-input-block">
<input type="text" name="header-value" placeholder="请输入header值" autocomplete="off" class="layui-input" value="{{(header.value || '').replace('"','&quot;')}}">
</div>
</div>
<hr>
{{# }) }}
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn header-add">添加一个Header</button>
</div>
</div>
</div>
<div class="layui-tab-item">
<!-- Proxy address; the backend splits the value on ":" into host and port -->
<div class="layui-form-item">
<label class="layui-form-label">代理</label>
<div class="layui-input-block">
<input type="text" name="proxy" placeholder="host:port" value="{{(d.data.object.proxy || '').replace('"','&quot;')}}" autocomplete="off" class="layui-input input-default">
</div>
</div>
</div>
</div>
</div>

View File

@ -1,38 +1,40 @@
<div class="layui-tab layui-tab-fixed layui-tab-brief">
<ul class="layui-tab-title">
<li class="layui-this">全局配置</li>
<li>全局Header</li>
</ul>
<div class="layui-tab-content">
<div class="layui-tab-content editor-form-node">
<div class="layui-tab-item layui-show">
<div class="layui-form editor-form-node">
<div class="layui-form">
<div class="layui-form-item">
<label class="layui-form-label">爬虫名称</label>
<div class="layui-input-block">
<input type="text" name="spiderName" placeholder="请输入爬虫名称" autocomplete="off" class="layui-input" value="{{d.data.object.spiderName || '未定义名称'}}">
</div>
</div>
{{# layui.each(d.data.object.headers,function(index,header){ }}
<hr>
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close header-remove"></i>
<label class="layui-form-label">header名</label>
<div class="layui-input-block">
<input type="text" name="header-name" placeholder="请输入header名" autocomplete="off" class="layui-input" value="{{header.name || ''}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">header值</label>
<div class="layui-input-block">
<input type="text" name="header-value" placeholder="请输入header值" autocomplete="off" class="layui-input" value="{{header.value || ''}}">
</div>
</div>
{{# }) }}
<hr>
<div class="layui-form-item">
</div>
</div>
<div class="layui-tab-item">
{{# layui.each(d.data.object.headers,function(index,header){ }}
<div class="layui-form-item layui-form-relative">
<i class="layui-icon layui-icon-close header-remove"></i>
<label class="layui-form-label">header名</label>
<div class="layui-input-block">
<button class="layui-btn header-add">添加一个Header</button>
</div>
</div>
<input type="text" name="header-name" placeholder="请输入header名" autocomplete="off" class="layui-input" value="{{header.name || ''}}">
</div>
</div>
<div class="layui-form-item">
<label class="layui-form-label">header值</label>
<div class="layui-input-block">
<input type="text" name="header-value" placeholder="请输入header值" autocomplete="off" class="layui-input" value="{{(header.value || '').replace('"','&quot;')}}">
</div>
</div>
<hr>
{{# }) }}
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn header-add">添加一个Header</button>
</div>
</div>
</div>
</div>