|
@@ -0,0 +1,37 @@
|
|
|
+package com.winhc.bigdata.udf;
|
|
|
+
|
|
|
+import com.aliyun.odps.udf.UDF;
|
|
|
+import org.apache.commons.lang.StringUtils;
|
|
|
+
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+
|
|
|
+/**
|
|
|
+ * @Author: π
|
|
|
+ * @Description: url 提取
|
|
|
+ */
|
|
|
+public class get_url extends UDF {
|
|
|
+
|
|
|
+ Pattern pattern2 = Pattern.compile("^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z0-9]{2,6}(/)");
|
|
|
+ Pattern pattern1 = Pattern.compile("http.+?html|http.+?cn|http.+?com|http.+/");
|
|
|
+
|
|
|
+ public String evaluate(String url) {
|
|
|
+ if (StringUtils.isBlank(url)) {
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+ Matcher matcher = pattern2.matcher(url);
|
|
|
+ if (matcher.find()) {
|
|
|
+ System.out.println(matcher.group());
|
|
|
+ return matcher.group();
|
|
|
+ }
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String[] args) {
|
|
|
+ String str1 ="http://ssfw.zqdhfy.gov.cn/ktxx.aspx?cateId=15";
|
|
|
+ String str ="http://114.252.22.26.35/xx/yy/ktxx.aspx?cateId=15";
|
|
|
+ String res = new get_url().evaluate(str);
|
|
|
+ System.out.println(res);
|
|
|
+
|
|
|
+ }
|
|
|
+}
|