xufei 2 years ago
parent
commit
1caa50ff1f

+ 46 - 0
src/main/java/com/winhc/bigdata/udf/etl/CompanyPhoneOrEmailMerge.java

@@ -0,0 +1,46 @@
+package com.winhc.bigdata.udf.etl;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+import com.winhc.bigdata.utils.CompanyUtils;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNames;
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNamesPlus;
+
+/**
+ * ODPS UDF that merges two delimited lists of company contact values (phone numbers or
+ * e-mail addresses) into one list without duplicates.
+ *
+ * <p>Entries inside each input are separated by {@code \t;\t}:
+ * <ul>
+ *   <li>emails: union of the latest annual report's e-mails and the existing e-mails,
+ *       with the latest year's values listed first;</li>
+ *   <li>phones: union of the latest annual report's phone numbers and the existing phone
+ *       numbers, with the latest year's values listed first.</li>
+ * </ul>
+ *
+ * <p>Created 2022/3/7 11:34.
+ *
+ * @author: π
+ */
+public class CompanyPhoneOrEmailMerge extends UDF {
+
+    /**
+     * Merges the newest values with the previously stored ones.
+     *
+     * <p>Each argument is split and cleaned by {@code CompanyUtils.spiltNamesPlus}; handling of
+     * {@code null} or blank input is delegated to that helper. Entries taken from
+     * {@code newContent} come first, followed by the entries of {@code oldContent} that are not
+     * already present. Both groups keep their input order and every value appears at most once.
+     *
+     * @param newContent delimited values from the latest annual report
+     * @param oldContent delimited values that already exist
+     * @return the ordered union of both inputs, free of duplicates and blank entries
+     */
+    public List<String> evaluate(String newContent, String oldContent) {
+        List<String> merged = new ArrayList<>(spiltNamesPlus(newContent));
+        merged.addAll(spiltNamesPlus(oldContent));
+        // distinct() on an ordered stream is stable: the first occurrence of a value wins, so
+        // new values stay in front and old values keep their input order instead of hash order.
+        return merged.stream()
+                .filter(StringUtils::isNotBlank)
+                .distinct()
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Manual smoke test: prints the merged result for a pair of sample inputs.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        // Replace either sample with "" or null to try the empty-input paths by hand.
+        String nc = "@@@18916678888中文哈哈哈\t;\t0527-83626688\t;\t";
+        String oc = "0527-83626688\t;\t13611541063天###@\t;\t18916678888\t;\t0527-83626688\t;\t0527-83626688\t;\t";
+        CompanyPhoneOrEmailMerge c = new CompanyPhoneOrEmailMerge();
+        System.out.println(c.evaluate(nc, oc));
+    }
+}

+ 12 - 0
src/main/java/com/winhc/bigdata/utils/CompanyUtils.java

@@ -3,6 +3,7 @@ package com.winhc.bigdata.utils;
 import com.alibaba.fastjson.JSONObject;
 import com.alibaba.fastjson.JSONObject;
 import com.aliyun.odps.utils.StringUtils;
 import com.aliyun.odps.utils.StringUtils;
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
+import com.winhc.bigdata.udf.etl.CompanyPhoneOrEmailMerge;
 
 
 import java.util.Arrays;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Collections;
@@ -24,6 +25,17 @@ public class CompanyUtils {
         return Arrays.stream(val.split("\t;\t")).map(String::trim).filter(StringUtils::isNotBlank).map(CompanyUtils::valTrim).collect(Collectors.toList());
         return Arrays.stream(val.split("\t;\t")).map(String::trim).filter(StringUtils::isNotBlank).map(CompanyUtils::valTrim).collect(Collectors.toList());
     }
     }
 
 
+    /** Matches one character in the range U+4E00..U+9FA5; compiled once and reused by cleanup2. */
+    private static final Pattern CLEANUP2_CJK_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]");
+
+    /**
+     * Removes every character in the range U+4E00..U+9FA5 (Chinese ideographs) from the input.
+     *
+     * @param s text to clean
+     * @return {@code s} without those characters, or an empty string when
+     *         {@code StringUtils.isBlank(s)} reports the input as blank
+     */
+    public static String cleanup2(String s) {
+        if (StringUtils.isBlank(s)) {
+            return "";
+        }
+        return CLEANUP2_CJK_PATTERN.matcher(s).replaceAll("");
+    }
+
+    /**
+     * Splits {@code val} with {@link #spiltNames(String)}, runs each entry through
+     * {@link #cleanup2(String)}, and drops entries that are blank after cleaning.
+     *
+     * @param val delimited text to split
+     * @return the cleaned, non-blank entries in their original order
+     */
+    public static List<String> spiltNamesPlus(String val) {
+        final List<String> names = spiltNames(val);
+        return names.stream()
+                .map(name -> cleanup2(name))
+                .filter(cleaned -> StringUtils.isNotBlank(cleaned))
+                .collect(Collectors.toList());
+    }
+
 
 
     private static String valTrim(String val) {
     private static String valTrim(String val) {
         if (StringUtils.isBlank(val)) {
         if (StringUtils.isBlank(val)) {