Prechádzať zdrojové kódy

Merge branch 'master' of http://139.224.213.4:3000/bigdata/UDF_Max

JimZhang 2 rokov pred
rodič
commit
4c01e8d9bd

+ 24 - 0
src/main/java/com/winhc/bigdata/udf/StringCleanupV2.java

@@ -0,0 +1,24 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * ODPS UDF that strips everything except CJK ideographs from a string.
+ *
+ * <p>Every character outside the range U+4E00..U+9FA5 (digits, Latin letters,
+ * punctuation, whitespace, ...) is removed from the input.
+ *
+ * <p>Created 2020/5/14 16:26.
+ *
+ * @author π
+ */
+public class StringCleanupV2 extends UDF {
+    /** Matches any single character that lies outside U+4E00..U+9FA5. */
+    private static final Pattern NON_CJK = Pattern.compile("[^\\u4e00-\\u9fa5]");
+
+    /**
+     * Removes all characters that are not in U+4E00..U+9FA5.
+     *
+     * @param val text to clean; may be {@code null}
+     * @return the remaining characters in their original order, or an empty
+     *         string when {@code val} is blank according to
+     *         {@code StringUtils.isNotBlank}
+     */
+    public String evaluate(String val) {
+        if (!StringUtils.isNotBlank(val)) {
+            return "";
+        }
+        return NON_CJK.matcher(val).replaceAll("");
+    }
+
+    /**
+     * Manual smoke test: prints the cleaned form of a sample string.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        String sample = "123AFzxc詹三000)(.里";
+        System.out.println(new StringCleanupV2().evaluate(sample));
+    }
+}

+ 55 - 0
src/main/java/com/winhc/bigdata/udf/ToNum.java

@@ -0,0 +1,55 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.math.BigDecimal;
+import java.util.regex.Pattern;
+
+/**
+ * ODPS UDF that turns text such as an amount followed by a Chinese unit
+ * (for example a number followed by a currency name) into a rounded number.
+ *
+ * <p>Only characters in the range U+4E00..U+9FA5 are stripped before parsing.
+ * If what remains is not a valid decimal literal the conversion yields
+ * {@code null} instead of failing the job.
+ *
+ * <p>Created 2020/5/14 16:26.
+ *
+ * @author π
+ */
+public class ToNum extends UDF {
+    /** Matches a single character in U+4E00..U+9FA5. */
+    private static final Pattern CJK_CHARS = Pattern.compile("[\\u4e00-\\u9fa5]");
+
+    /**
+     * Parses the numeric part of {@code val} and rounds it half-up.
+     *
+     * @param val text containing a decimal number, optionally mixed with
+     *            characters in U+4E00..U+9FA5; may be {@code null}
+     * @param len number of digits to keep after the decimal point
+     * @return the rounded value, or {@code null} when {@code val} is blank,
+     *         {@code len} is {@code null}, or the stripped text is not a
+     *         valid decimal number
+     */
+    public Double evaluate(String val, Integer len) {
+        // A null scale used to surface as an unboxing NPE that the catch-all
+        // handler swallowed; handle it explicitly with the same result.
+        if (StringUtils.isBlank(val) || len == null) {
+            return null;
+        }
+        String num = CJK_CHARS.matcher(val).replaceAll("");
+        try {
+            return round(num, "1", len);
+        } catch (NumberFormatException | ArithmeticException ignored) {
+            // Best effort by design: unparseable input maps to SQL NULL.
+            return null;
+        }
+    }
+
+    /**
+     * Divides {@code d1} by {@code d2} and rounds the quotient half-up to
+     * {@code len} decimal places.
+     *
+     * <p>Pass {@code "1"} as {@code d2} to round {@code d1} itself; pass e.g.
+     * {@code "100"} to round the value after dividing it by 100.
+     *
+     * @param d1  dividend as a decimal literal
+     * @param d2  divisor as a decimal literal
+     * @param len number of digits to keep after the decimal point
+     * @return the rounded quotient converted to {@code double}
+     * @throws NumberFormatException if {@code d1} or {@code d2} is not a valid
+     *                               decimal literal
+     * @throws ArithmeticException   if {@code d2} is zero
+     * @throws NullPointerException  if any argument is {@code null}
+     */
+    public static double round(String d1, String d2, Integer len) {
+        BigDecimal dividend = new BigDecimal(d1);
+        BigDecimal divisor = new BigDecimal(d2);
+        // RoundingMode.HALF_UP replaces the deprecated int constant
+        // BigDecimal.ROUND_HALF_UP; the rounding behaviour is the same.
+        return dividend.divide(divisor, len, java.math.RoundingMode.HALF_UP).doubleValue();
+    }
+
+    /**
+     * Manual smoke test: prints the conversion result for a few samples.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        ToNum j = new ToNum();
+        System.out.println(j.evaluate("9000.703624001万人民币",6));
+        System.out.println(j.evaluate("0.001000",6));
+        System.out.println(j.evaluate("企业选择不公示",6));
+        System.out.println(j.evaluate("0万元",6));
+    }
+}

+ 46 - 0
src/main/java/com/winhc/bigdata/udf/etl/CompanyPhoneOrEmailMerge.java

@@ -0,0 +1,46 @@
+package com.winhc.bigdata.udf.etl;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+import com.winhc.bigdata.utils.CompanyUtils;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNames;
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNamesPlus;
+
+/**
+ * ODPS UDF that merges the contact values (e-mail addresses or phone numbers)
+ * of the latest annual report with the values already on record.
+ *
+ * <p>Both inputs are lists joined with the separator {@code \t;\t}. The result
+ * is their union: values from the latest report come first, followed by the
+ * previously known values that are not already present.
+ *
+ * <p>Created 2022/3/7 11:34.
+ *
+ * @author π
+ */
+public class CompanyPhoneOrEmailMerge extends UDF {
+    /**
+     * Builds the ordered, duplicate-free union of two separator-joined lists.
+     *
+     * <p>Each input is split and cleaned by {@code CompanyUtils.spiltNamesPlus}.
+     * Every distinct value is emitted once, in order of first appearance, with
+     * all values of {@code newContent} preceding those only found in
+     * {@code oldContent}.
+     *
+     * @param newContent values from the latest annual report
+     * @param oldContent values already on record
+     * @return the merged values; empty when neither input yields a value
+     */
+    public List<String> evaluate(String newContent, String oldContent) {
+        List<String> merged = new ArrayList<>();
+        Set<String> seen = new HashSet<>();
+        appendDistinct(spiltNamesPlus(newContent), merged, seen);
+        appendDistinct(spiltNamesPlus(oldContent), merged, seen);
+        return merged;
+    }
+
+    /**
+     * Appends every non-blank value not yet in {@code seen} to {@code target},
+     * preserving the order of {@code values}.
+     */
+    private static void appendDistinct(List<String> values, List<String> target, Set<String> seen) {
+        for (String value : values) {
+            // Set.add returns false for a value that was already recorded.
+            if (StringUtils.isNotBlank(value) && seen.add(value)) {
+                target.add(value);
+            }
+        }
+    }
+
+
+    /**
+     * Manual smoke test: prints the merge result for two sample lists.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        String nc = "@@@18916678888中文哈哈哈\t;\t0527-83626688\t;\t";
+        String oc = "0527-83626688\t;\t13611541063天###@\t;\t18916678888\t;\t0527-83626688\t;\t0527-83626688\t;\t";
+        CompanyPhoneOrEmailMerge c = new CompanyPhoneOrEmailMerge();
+        System.out.println(c.evaluate(nc, oc));
+    }
+}

+ 12 - 0
src/main/java/com/winhc/bigdata/utils/CompanyUtils.java

@@ -3,6 +3,7 @@ package com.winhc.bigdata.utils;
 import com.alibaba.fastjson.JSONObject;
 import com.aliyun.odps.utils.StringUtils;
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
+import com.winhc.bigdata.udf.etl.CompanyPhoneOrEmailMerge;
 
 import java.util.Arrays;
 import java.util.Collections;
@@ -24,6 +25,17 @@ public class CompanyUtils {
         return Arrays.stream(val.split("\t;\t")).map(String::trim).filter(StringUtils::isNotBlank).map(CompanyUtils::valTrim).collect(Collectors.toList());
     }
 
+    /** Matches a single character in U+4E00..U+9FA5; compiled once rather than on every call. */
+    private static final Pattern CLEANUP2_CJK_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]");
+
+    /**
+     * Removes every character in the range U+4E00..U+9FA5 from a string.
+     *
+     * @param s text to clean; may be {@code null}
+     * @return {@code s} without those characters, or an empty string when
+     *         {@code s} is blank according to {@code StringUtils.isBlank}
+     */
+    public static String cleanup2(String s) {
+        if (StringUtils.isBlank(s)) {
+            return "";
+        }
+        return CLEANUP2_CJK_PATTERN.matcher(s).replaceAll("");
+    }
+
+    /**
+     * Splits a separator-joined value with {@code spiltNames}, runs every
+     * element through {@code cleanup2} and drops the elements that end up
+     * blank.
+     *
+     * @param val separator-joined values, passed unchanged to {@code spiltNames}
+     * @return the cleaned, non-blank elements in their original order
+     */
+    public static List<String> spiltNamesPlus(String val) {
+        List<String> names = spiltNames(val);
+        return names.stream()
+                .map(name -> cleanup2(name))
+                .filter(cleaned -> StringUtils.isNotBlank(cleaned))
+                .collect(Collectors.toList());
+    }
+
 
     private static String valTrim(String val) {
         if (StringUtils.isBlank(val)) {