增强正则抽取函数

2019-10-22 11:09:17 +08:00 · 2019-10-22 11:09:17 +08:00 · af830b9cc7
commit af830b9cc7
parent 54602dc3a1
6 changed files with 158 additions and 10 deletions
--- a/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/ExtractFunctionExecutor.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/ExtractFunctionExecutor.java
@@ -31,12 +31,36 @@ public class ExtractFunctionExecutor implements FunctionExecutor{
 		return ExtractUtils.getFirstMatcher(content, pattern, true);
 	}
 	
+	@Comment("根据正则表达式提取内容")
+	@Example("${extract.regx(resp.html,'<title>(.*?)</title>',1)}")
+	public static String regx(String content,String pattern,int groupIndex){
+		return ExtractUtils.getFirstMatcher(content, pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${extract.regx(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<String> regx(String content,String pattern,List<Integer> groups){
+		return ExtractUtils.getFirstMatcher(content, pattern, groups);
+	}
+	
 	@Comment("根据正则表达式提取内容")
 	@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>')}")
 	public static List<String> regxs(String content,String pattern){
 		return ExtractUtils.getMatchers(content, pattern, true);
 	}
 	
+	@Comment("根据正则表达式提取内容")
+	@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>',1)}")
+	public static List<String> regxs(String content,String pattern,int groupIndex){
+		return ExtractUtils.getMatchers(content, pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${extract.regxs(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<List<String>> regxs(String content,String pattern,List<Integer> groups){
+		return ExtractUtils.getMatchers(content, pattern, groups);
+	}
+	
 	@Comment("根据xpath提取内容")
 	@Example("${extract.xpath(resp.element(),'//title/text()')}")
 	public static String xpath(Element element,String xpath){
--- a/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ElementFunctionExtension.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ElementFunctionExtension.java
@@ -41,11 +41,35 @@ public class ElementFunctionExtension implements FunctionExtension{
 	}
 	
 	@Comment("根据正则表达式提取内容")
-	@Example("${elementVar.regx('<h2>(.*?)</h2>')}")
+	@Example("${elementVar.regx('<title>(.*?)</title>',1)}")
+	public static String regx(Element element,String regx,int groupIndex){
+		return ExtractUtils.getFirstMatcher(element.html(), regx, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<String> regx(Element element,String regx,List<Integer> groups){
+		return ExtractUtils.getFirstMatcher(element.html(), regx, groups);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementVar.regxs('<h2>(.*?)</h2>')}")
 	public static List<String> regxs(Element element,String regx){
 		return ExtractUtils.getMatchers(element.html(), regx, true);
 	}
 	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementVar.regxs('<h2>(.*?)</h2>',1)}")
+	public static List<String> regxs(Element element,String regx,int groupIndex){
+		return ExtractUtils.getMatchers(element.html(), regx, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<List<String>> regxs(Element element,String regx,List<Integer> groups){
+		return ExtractUtils.getMatchers(element.html(), regx, groups);
+	}
+	
 	@Comment("根据css选择器提取内容")
 	@Example("${elementVar.selector('div > a')}")
 	public static Element selector(Element element,String cssQuery){
--- a/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ElementsFunctionExtension.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ElementsFunctionExtension.java
@@ -40,11 +40,35 @@ public class ElementsFunctionExtension implements FunctionExtension{
 	}
 	
 	@Comment("根据正则表达式提取内容")
-	@Example("${elementsVar.regx('<h2>(.*?)</h2>')}")
+	@Example("${elementsVar.regx('<title>(.*?)</title>',1)}")
+	public static String regx(Elements elements,String regx,int groupIndex){
+		return ExtractUtils.getFirstMatcher(elements.html(), regx, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementsVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<String> regx(Elements elements,String regx,List<Integer> groups){
+		return ExtractUtils.getFirstMatcher(elements.html(), regx, groups);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementsVar.regxs('<h2>(.*?)</h2>')}")
 	public static List<String> regxs(Elements elements,String regx){
 		return ExtractUtils.getMatchers(elements.html(), regx, true);
 	}
 	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementsVar.regxs('<h2>(.*?)</h2>',1)}")
+	public static List<String> regxs(Elements elements,String regx,int groupIndex){
+		return ExtractUtils.getMatchers(elements.html(), regx, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取内容")
+	@Example("${elementsVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<List<String>> regxs(Elements elements,String regx,List<Integer> groups){
+		return ExtractUtils.getMatchers(elements.html(), regx, groups);
+	}
+	
 	@Comment("根据css选择器提取内容")
 	@Example("${elementsVar.selector('div > a')}")
 	public static Element selector(Elements elements,String selector){
--- a/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ResponseFunctionExtension.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/ResponseFunctionExtension.java
@@ -46,12 +46,36 @@ public class ResponseFunctionExtension implements FunctionExtension{
 		return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, true);
 	}
 	
+	@Comment("根据正则表达式提取请求结果中的内容")
+	@Example("${resp.regx('<title>(.*?)</title>',1)}")
+	public static String regx(SpiderResponse response,String pattern,int groupIndex){
+		return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取请求结果中的内容")
+	@Example("${resp.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<String> regx(SpiderResponse response,String pattern,List<Integer> groups){
+		return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, groups);
+	}
+	
 	@Comment("根据正则表达式提取请求结果中的内容")
 	@Example("${resp.regxs('<h2>(.*?)</h2>')}")
 	public static List<String> regxs(SpiderResponse response,String pattern){
 		return ExtractUtils.getMatchers(response.getHtml(), pattern, true);
 	}
 	
+	@Comment("根据正则表达式提取请求结果中的内容")
+	@Example("${resp.regxs('<h2>(.*?)</h2>',1)}")
+	public static List<String> regxs(SpiderResponse response,String pattern,int groupIndex){
+		return ExtractUtils.getMatchers(response.getHtml(), pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取请求结果中的内容")
+	@Example("${resp.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<List<String>> regxs(SpiderResponse response,String pattern,List<Integer> groups){
+		return ExtractUtils.getMatchers(response.getHtml(), pattern, groups);
+	}
+	
 	@Comment("根据css选择器提取请求结果")
 	@Example("${resp.selector('div > a')}")
 	public static Element selector(SpiderResponse response,String selector){
--- a/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/StringFunctionExtension.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/executor/function/extension/StringFunctionExtension.java
@@ -33,11 +33,35 @@ public class StringFunctionExtension implements FunctionExtension{
 	}
 	
 	@Comment("根据正则表达式提取String中的内容")
-	@Example("${strVar.regx('<h2>(.*?)</h2>')}")
+	@Example("${strVar.regx('<title>(.*?)</title>',1)}")
+	public static String regx(String source,String pattern,int groupIndex){
+		return ExtractUtils.getFirstMatcher(source, pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取String中的内容")
+	@Example("${strVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<String> regx(String source,String pattern,List<Integer> groups){
+		return ExtractUtils.getFirstMatcher(source, pattern, groups);
+	}
+	
+	@Comment("根据正则表达式提取String中的内容")
+	@Example("${strVar.regxs('<h2>(.*?)</h2>')}")
 	public static List<String> regxs(String source,String pattern){
 		return ExtractUtils.getMatchers(source, pattern, true);
 	}
 	
+	@Comment("根据正则表达式提取String中的内容")
+	@Example("${strVar.regxs('<h2>(.*?)</h2>',1)}")
+	public static List<String> regxs(String source,String pattern,int groupIndex){
+		return ExtractUtils.getMatchers(source, pattern, groupIndex);
+	}
+	
+	@Comment("根据正则表达式提取String中的内容")
+	@Example("${strVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
+	public static List<List<String>> regxs(String source,String pattern,List<Integer> groups){
+		return ExtractUtils.getMatchers(source, pattern, groups);
+	}
+	
 	@Comment("根据xpath在String变量中查找")
 	@Example("${strVar.xpath('//title/text()')}")
 	@Return({Element.class,String.class})
--- a/spider-flow-core/src/main/java/org/spiderflow/core/utils/ExtractUtils.java
+++ b/spider-flow-core/src/main/java/org/spiderflow/core/utils/ExtractUtils.java
@@ -12,7 +12,6 @@ import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.seimicrawler.xpath.JXDocument;
 import org.seimicrawler.xpath.JXNode;
-import org.springframework.util.StringUtils;

 import com.alibaba.fastjson.JSONPath;
 /**
@@ -34,26 +33,55 @@ public class ExtractUtils {
 	}
 	
 	public static List<String> getMatchers(String content,String regx,boolean isGroup){
+		return getMatchers(content,regx,isGroup ? 1: 0);
+	}
+	
+	public static List<String> getMatchers(String content,String regx,int groupIndex){
 		Matcher matcher = compile(regx).matcher(content);
 		List<String> results = new ArrayList<>();
 		while(matcher.find()){
-			String group = isGroup ? matcher.group(1) : matcher.group();
-			if(!StringUtils.isEmpty(group)){
-				results.add(group);
+			results.add(matcher.group(groupIndex));
+		}
+		return results;
+	}
+	
+	public static List<List<String>> getMatchers(String content,String regx,List<Integer> groups){
+		Matcher matcher = compile(regx).matcher(content);
+		List<List<String>> results = new ArrayList<>();
+		while(matcher.find()){
+			List<String> matches = new ArrayList<>();
+			for (Integer groupIndex : groups) {
+				matches.add(matcher.group(groupIndex));
 			}
+			results.add(matches);
 		}
 		return results;
 	}
 	
 	public static String getFirstMatcher(String content,String regx,boolean isGroup){
+		
+		return getFirstMatcher(content,regx,isGroup ? 1 : 0);
+	}
+	
+	public static String getFirstMatcher(String content,String regx,int groupIndex){
 		Matcher matcher = compile(regx).matcher(content);
-		while(matcher.find()){
-			String group = isGroup ? matcher.group(1) : matcher.group();
-			return group;
+		if(matcher.find()){
+			return matcher.group(groupIndex);
 		}
 		return null;
 	}
 	
+	public static List<String> getFirstMatcher(String content,String regx,List<Integer> groups){
+		Matcher matcher = compile(regx).matcher(content);
+		List<String> matches = new ArrayList<>();
+		if(matcher.find()){
+			for (Integer groupIndex : groups) {
+				matches.add(matcher.group(groupIndex));
+			}
+		}
+		return matches;
+	}
+	
 	public static String getHostFromURL(String url){
 		return getFirstMatcher(url, "(?<=//|)((\\w)+\\.)+\\w+", false);
 	}