Przeglądaj źródła

Merge remote-tracking branch 'origin/master'

许家凯 2 lat temu
rodzic
commit
8211ebdc50

+ 70 - 0
src/main/java/com/winhc/bigdata/udf/CompanyOrgTypeNew.java

@@ -0,0 +1,70 @@
+package com.winhc.bigdata.udf;
+
+import com.alibaba.fastjson.JSON;
+import com.aliyun.odps.udf.UDF;
+
+
+import static org.apache.commons.lang.StringUtils.*;
+
+import java.util.*;
+
+/**
+ * @author ZhangJi
+ * @since 2022-05-18 10:55
+ */
+public class CompanyOrgTypeNew extends UDF {
+    private static final List<String> FOREIGN = Arrays.asList("中外", "外资", "外国", "外商");
+    private static final List<String> CHINA = Arrays.asList("国有", "国资");
+
+    public String evaluate(String company_org_type, String company_name, String credit_code) {
+        if (isEmpty(company_name)) return null;
+//分公司判断
+        if (contains(company_org_type, "分公司") || endsWith(company_name, "分公司")) {
+            return JSON.toJSONString(Collections.singletonList("分公司"));
+        }
+
+//工商注册类型判断
+        Set<String> result = new HashSet<>();
+        if (contains(company_org_type, "股份") && !contains(company_org_type, "合作") && contains(company_name, "公司")) {
+            result.add("股份有限公司");
+        } else if (contains(company_org_type, "有限责任") || contains(company_name, "有限责任公司") || contains(company_name, "有限公司")) {
+            result.add("有限责任公司");
+        } else if (contains(company_org_type, "股份合作")) {
+            result.add("股份合作企业");
+        } else if (contains(company_org_type, "个体") || contains(company_org_type, "个人经营")) {
+            result.add("个体工商户");
+        } else if (contains(company_org_type, "专业合作社") || contains(company_name, "专业合作社") || startsWith(credit_code, "93")) {
+            result.add("农民专业合作社");
+        } else if (contains(company_org_type, "集体")) {
+            result.add("集体所有制");
+        } else if (contains(company_org_type, "合伙")) {
+            if (company_org_type.contains("有限")) {
+                result.add("有限合伙");
+            } else if (company_org_type.contains("普通")) {
+                result.add("普通合伙");
+            }
+        } else if (contains(company_org_type, "联营")) {
+            result.add("联营企业");
+        }
+        // 资金类型
+        if (containsAny(company_org_type, "港澳台")) {
+            result.add("港、澳、台商投资企业");
+        } else if (FOREIGN.stream().anyMatch(s -> contains(company_org_type, s))) {
+            result.add("外商投资企业");
+        } else if (CHINA.stream().anyMatch(s -> contains(company_org_type, s))) {
+            result.add("国有企业");
+        }
+        //特殊类型
+        if (contains(company_org_type, "独资")) {
+            result.add("独资企业");
+        }
+        if (result.isEmpty()) {
+            return null;
+        }
+        return JSON.toJSONString(result);
+    }
+
+    public static void main(String[] args) {
+        System.out.println(new CompanyOrgTypeNew().evaluate("分公司", "雅诗兰黛(上海)商贸有限公司北京分公司", "91110105MA01Q19Q2L"));
+    }
+}

+ 30 - 0
src/main/java/com/winhc/bigdata/udf/StringAnyContains.java

@@ -0,0 +1,30 @@
+package com.winhc.bigdata.udf;
+
+
+import com.aliyun.odps.udf.UDF;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * @author ZhangJi
+ * @since 2022-04-13 18:39
+ */
+public class StringAnyContains extends UDF {
+    public Boolean evaluate(String val,String... args) {
+        if(StringUtils.isEmpty(val))return false;
+        if(args==null||args.length==0) {
+            return false;
+        }
+        for(String arg:args) {
+            if(arg!=null&&val.contains(arg)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public static void main(String[] args) {
+        StringAnyContains udf=new StringAnyContains();
+
+
+    }
+}

+ 27 - 0
src/main/java/com/winhc/bigdata/udf/StringCleanupChangeInfo.java

@@ -0,0 +1,27 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * @author ZhangJi
+ * @since 2022-03-30 13:35
+ */
+public class StringCleanupChangeInfo extends UDF {
+    private static final Pattern first_p = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z()()]");
+    private static final Pattern second_p = Pattern.compile("[((][^((]+[))]$");
+    private static final Pattern third_p = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z]");
+
+    public String evaluate(String val) {
+        if (StringUtils.isEmpty(val)) return "";
+        val = first_p.matcher(val).replaceAll("");
+        val = second_p.matcher(val).replaceAll("");
+        return third_p.matcher(val).replaceAll("");
+    }
+
+    public static void main(String[] args) {
+        System.out.println(new StringCleanupChangeInfo().evaluate("f (xx()))===="));
+    }
+}

+ 24 - 0
src/main/java/com/winhc/bigdata/udf/StringCleanupV2.java

@@ -0,0 +1,24 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * @Author: π
+ * @Date: 2020/5/14 16:26
+ * @Description: Removes every character outside the CJK range U+4E00..U+9FA5
+ * from a string.
+ */
+public class StringCleanupV2 extends UDF {
+    private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa5]");
+
+    /**
+     * @param val raw text; blank or null yields the empty string
+     * @return {@code val} reduced to its CJK characters, never null
+     */
+    public String evaluate(String val) {
+        if (!StringUtils.isNotBlank(val)) {
+            return "";
+        }
+        return pattern.matcher(val).replaceAll("");
+    }
+
+    public static void main(String[] args) {
+        String cleaned = new StringCleanupV2().evaluate("123AFzxc詹三000)(.里");
+        System.out.println(cleaned);
+    }
+}

+ 55 - 0
src/main/java/com/winhc/bigdata/udf/ToNum.java

@@ -0,0 +1,55 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.math.BigDecimal;
+import java.util.regex.Pattern;
+
+/**
+ * @Author: π
+ * @Date: 2020/5/14 16:26
+ * @Description: Parses a number out of a string by removing CJK characters
+ * (U+4E00..U+9FA5) and rounds it to a given number of decimal places.
+ */
+public class ToNum extends UDF {
+    private static final Pattern pattern = Pattern.compile("[\\u4e00-\\u9fa5]");
+
+    /**
+     * Strips CJK characters from {@code val} and parses the remainder as a decimal.
+     *
+     * @param val text such as "9000.70万人民币"; blank or null yields null
+     * @param len number of decimal places to keep; null yields null
+     * @return the rounded value, or null when the remaining text is not a valid number
+     */
+    public Double evaluate(String val, Integer len) {
+        // A null len used to surface as an unboxing NPE swallowed by a catch-all;
+        // handle it explicitly so the catch can stay narrow.
+        if (StringUtils.isBlank(val) || len == null) {
+            return null;
+        }
+        String num = pattern.matcher(val).replaceAll("");
+        try {
+            return round(num, "1", len);
+        } catch (NumberFormatException | ArithmeticException e) {
+            // Not a parseable number (e.g. the text was entirely CJK): report "no value".
+            return null;
+        }
+    }
+
+    /**
+     * Divides {@code d1} by {@code d2} and rounds the quotient half-up.
+     *
+     * @param d1  dividend, as a decimal string
+     * @param d2  divisor, as a decimal string; pass "1" to round {@code d1} itself
+     * @param len number of decimal places to keep in the quotient
+     * @return the rounded quotient as a double
+     * @throws NumberFormatException if {@code d1} or {@code d2} is not a valid decimal
+     * @throws ArithmeticException   if {@code d2} is zero
+     */
+    public static double round(String d1, String d2, Integer len) {
+        BigDecimal dividend = new BigDecimal(d1);
+        BigDecimal divisor = new BigDecimal(d2);
+        // RoundingMode.HALF_UP replaces the deprecated BigDecimal.ROUND_HALF_UP int
+        // constant; fully qualified so the file's import list needs no change.
+        return dividend.divide(divisor, len, java.math.RoundingMode.HALF_UP).doubleValue();
+    }
+
+    public static void main(String[] args) {
+        ToNum j = new ToNum();
+        System.out.println(j.evaluate("9000.703624001万人民币",6));
+        System.out.println(j.evaluate("0.001000",6));
+        System.out.println(j.evaluate("企业选择不公示",6));
+        System.out.println(j.evaluate("0万元",6));
+    }
+}

+ 46 - 0
src/main/java/com/winhc/bigdata/udf/etl/CompanyPhoneOrEmailMerge.java

@@ -0,0 +1,46 @@
+package com.winhc.bigdata.udf.etl;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+import com.winhc.bigdata.utils.CompanyUtils;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNames;
+import static com.winhc.bigdata.utils.CompanyUtils.spiltNamesPlus;
+
+/**
+ * ODPS UDF that merges two "\t;\t"-separated contact lists.
+ * <p>
+ * emails: union of the latest annual-report emails and the existing emails,
+ * latest year first.<br>
+ * phones: union of the latest annual-report phones and the existing phones,
+ * latest year first.
+ *
+ * @author: π
+ * 2022/3/7 11:34
+ */
+public class CompanyPhoneOrEmailMerge extends UDF {
+    /**
+     * Merges the new and old contact values into one duplicate-free list.
+     * <p>
+     * Values from {@code newContent} come first in their original order, followed
+     * by values that appear only in {@code oldContent}, also in original order.
+     * Routing the old values through a plain HashSet used to make their order
+     * arbitrary, and duplicates inside the new content were kept.
+     *
+     * @param newContent latest values, separated by "\t;\t"
+     * @param oldContent existing values, separated by "\t;\t"
+     * @return the merged values without blanks; possibly empty
+     */
+    public List<String> evaluate(String newContent, String oldContent) {
+        // LinkedHashSet keeps first-seen order while dropping duplicates; fully
+        // qualified so the file's import list needs no change.
+        Set<String> merged = new java.util.LinkedHashSet<>(spiltNamesPlus(newContent));
+        merged.addAll(spiltNamesPlus(oldContent));
+        return merged.stream()
+                .filter(StringUtils::isNotBlank)
+                .collect(Collectors.toList());
+    }
+
+
+    public static void main(String[] args) {
+        String nc = "@@@18916678888中文哈哈哈\t;\t0527-83626688\t;\t";
+//        nc = "";
+//        nc = null;
+        String oc = "0527-83626688\t;\t13611541063天###@\t;\t18916678888\t;\t0527-83626688\t;\t0527-83626688\t;\t";
+//        oc = "";
+//        oc = null;
+        CompanyPhoneOrEmailMerge c = new CompanyPhoneOrEmailMerge();
+        System.out.println(c.evaluate(nc, oc));
+    }
+}

+ 55 - 0
src/main/java/com/winhc/bigdata/udf/get_json_info.java

@@ -0,0 +1,55 @@
+package com.winhc.bigdata.udf;
+
+import com.alibaba.fastjson.JSON;
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * ODPS UDF that extracts the values of selected keys from every object of a
+ * JSON array and joins them with "@@".
+ *
+ * @author: π
+ */
+public class get_json_info extends UDF {
+
+    /**
+     * @param json a JSON array of objects
+     * @param keys the keys to extract, in output order; blank keys are skipped
+     * @return the distinct joined strings, one per array element, or null when
+     *         the input is blank, is "[]", {@code keys} is null, nothing is
+     *         produced, or parsing fails
+     */
+    public List<String> evaluate(String json, String... keys) {
+        if (StringUtils.isBlank(json) || "[]".equalsIgnoreCase(json) || keys == null) return null;
+        try {
+            List<Map> rows = JSON.parseArray(json, Map.class);
+            List<String> joined = rows.stream()
+                    .map(row -> joinValues(row, keys))
+                    .filter(StringUtils::isNotBlank)
+                    .distinct()
+                    .collect(Collectors.toList());
+            return joined.isEmpty() ? null : joined;
+        } catch (Exception e) {
+            // Best effort: a malformed row must not fail the job.
+            e.printStackTrace();
+            return null;
+        }
+
+    }
+
+    /**
+     * Joins the values of {@code keys} in {@code row} with "@@".
+     * A missing key and an explicit JSON null both contribute an empty string;
+     * previously a null value was joined as the literal text "null".
+     */
+    private static String joinValues(Map row, String[] keys) {
+        List<String> values = new ArrayList<>();
+        for (String key : keys) {
+            if (StringUtils.isNotBlank(key)) {
+                Object raw = row.get(key);
+                values.add(raw == null ? "" : raw.toString());
+            }
+        }
+        return String.join("@@", values);
+    }
+
+    /**
+     * @param o any object, may be null
+     * @return {@code o.toString()}, or null when {@code o} is null
+     */
+    public static String toStringV2(Object o) {
+        if (null == o) return null;
+        return o.toString();
+    }
+
+    public static void main(String[] args) {
+        get_json_info j = new get_json_info();
+        String json = "[{\"name\": \"周一凡\", \"litigant_id\": \"\"}, {\"name\": \"深圳市中装建设集团股份有限公司\", \"litigant_id\": \"627a851aeb97a3311b7f43a20d3a0df3\"}, {\"name\": \"杭州创意投资发展有限公司\", \"litigant_id\": \"2772b07553e36f501654c193c2728cd0\"}]";
+        List<String> evaluate = j.evaluate(json, "name", "litigant_id");
+        System.out.println(evaluate);
+    }
+
+}

+ 56 - 0
src/main/java/com/winhc/bigdata/udf/get_legal_v2.java

@@ -0,0 +1,56 @@
+package com.winhc.bigdata.udf;
+
+import com.alibaba.fastjson.JSON;
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * ODPS UDF that turns a JSON array of objects into "name@@id" strings,
+ * keeping only the objects whose {@code deleted} value is 0 or 1.
+ *
+ * @author: π
+ * 2021/8/30 16:57
+ */
+public class get_legal_v2 extends UDF {
+
+    /**
+     * @param json a JSON array of objects with "name", "id" and "deleted" keys
+     * @return the distinct "name@@id" strings of the qualifying objects, or null
+     *         when the input is blank, is "[]", nothing qualifies, or parsing fails
+     */
+    public List<String> evaluate(String json) {
+        if (StringUtils.isBlank(json) || "[]".equalsIgnoreCase(json)) return null;
+        try {
+            List<Map> rows = JSON.parseArray(json, Map.class);
+            List<String> pairs = rows.stream().map(row -> {
+                String deleted = text(row.get("deleted"));
+                // Constant-first equals: an object with a null "deleted" is skipped
+                // instead of throwing and discarding the whole array.
+                if ("0".equals(deleted) || "1".equals(deleted)) {
+                    return text(row.get("name")) + "@@" + text(row.get("id"));
+                }
+                return null;
+            }).filter(StringUtils::isNotBlank).distinct()
+                    .collect(Collectors.toList());
+            return pairs.isEmpty() ? null : pairs;
+        } catch (Exception e) {
+            // Best effort: a malformed row must not fail the job.
+            e.printStackTrace();
+            return null;
+        }
+
+    }
+
+    /** Renders a value as text; a missing key or JSON null becomes "" rather than "null". */
+    private static String text(Object value) {
+        return value == null ? "" : value.toString();
+    }
+
+    /**
+     * @param o any object, may be null
+     * @return {@code o.toString()}, or null when {@code o} is null
+     */
+    public static String toStringV2(Object o) {
+        if (null == o) return null;
+        return o.toString();
+    }
+
+    public static void main(String[] args) {
+        get_legal_v2 j = new get_legal_v2();
+        //String json = "[{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":0}]";
+        //String json = "[{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":1}]";
+        String json = "[{\"name\":\"冯金元111\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":9},{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":1},{\"name\":\"冯金元2\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":0}]";
+        List<String> evaluate = j.evaluate(json);
+        System.out.println(evaluate);
+    }
+
+}

+ 57 - 0
src/main/java/com/winhc/bigdata/udf/legal_merge.java

@@ -0,0 +1,57 @@
+package com.winhc.bigdata.udf;
+
+import com.alibaba.fastjson.JSON;
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * ODPS UDF that copies ids from a new JSON array of objects into an old one,
+ * matching the objects by their "name" value.
+ *
+ * @author: π
+ * 2021/8/30 16:57
+ */
+public class legal_merge extends UDF {
+
+    /**
+     * Overwrites the "id" of every old object whose name also occurs in the new array.
+     * <p>
+     * When a name occurs several times in {@code new_legal} the first id wins.
+     * A new object that lacks "id" or carries a JSON null id contributes an empty
+     * id; a null id used to make {@code Collectors.toMap} throw a NullPointerException.
+     *
+     * @param old_legal JSON array to update
+     * @param new_legal JSON array supplying the ids
+     * @return the updated array as JSON, or {@code old_legal} unchanged when either
+     *         argument is blank or "[]"
+     */
+    @SuppressWarnings({"rawtypes", "unchecked"}) // fastjson hands back raw Maps
+    public String evaluate(String old_legal, String new_legal) {
+        if (
+                StringUtils.isBlank(old_legal) || "[]".equalsIgnoreCase(old_legal)
+                        || StringUtils.isBlank(new_legal) || "[]".equalsIgnoreCase(new_legal)
+        ) return old_legal;
+        List<Map> oldEntries = JSON.parseArray(old_legal, Map.class);
+        List<Map> newEntries = JSON.parseArray(new_legal, Map.class);
+
+        // name -> id taken from the new array; only non-blank names take part.
+        Map<String, String> idByName = new HashMap<>();
+        for (Map entry : newEntries) {
+            String name = toStringV2(entry.get("name"));
+            if (StringUtils.isNotBlank(name)) {
+                Object id = entry.get("id");
+                idByName.putIfAbsent(name, id == null ? "" : id.toString());
+            }
+        }
+
+        for (Map entry : oldEntries) {
+            String name = toStringV2(entry.get("name"));
+            if (idByName.containsKey(name)) {
+                entry.put("id", idByName.get(name));
+            }
+        }
+        return JSON.toJSONString(oldEntries);
+    }
+
+    /**
+     * @param o any object, may be null
+     * @return {@code o.toString()}, or null when {@code o} is null
+     */
+    public static String toStringV2(Object o) {
+        if (null == o) return null;
+        return o.toString();
+    }
+
+    public static void main(String[] args) {
+        legal_merge j = new legal_merge();
+        //String json = "[{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":0}]";
+        //String json = "[{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":1}]";
+        String json1 = "[{\"name\":\"冯金元111\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":9},{\"name\":\"冯金元\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":1},{\"name\":\"冯金元2\",\"id\":\"pbc6c4f933e85850d1c401509877671b0\",\"type\":1,\"deleted\":0}]";
+        String json2 = "[{\"name\":\"冯金元\",\"id\":\"xxxxc4f933e85850d1c401509877671b0\"},{\"name\":\"冯金元111\",\"id\":\"000004f933e85850d1c401509877671b0\"}]";
+        String evaluate = j.evaluate(json1, json2);
+        System.out.println(evaluate);
+    }
+
+}

+ 53 - 0
src/main/java/com/winhc/bigdata/udf/replace_pid.java

@@ -0,0 +1,53 @@
+package com.winhc.bigdata.udf;
+
+import com.alibaba.fastjson.JSON;
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * ODPS UDF that blanks out selected values in every object of a JSON array
+ * when the value is exactly 33 characters long.
+ * NOTE(review): presumably a 33-character value is a person id ("p" followed
+ * by 32 hex characters, as in the sample in main) - confirm.
+ *
+ * @author: π
+ */
+public class replace_pid extends UDF {
+
+    /**
+     * @param json a JSON array of objects
+     * @param keys the keys whose values are inspected; blank keys are skipped
+     * @return the array re-serialised with every 33-character value under
+     *         {@code keys} replaced by ""; null when the input is blank, is "[]"
+     *         or {@code keys} is null; "[]" when parsing fails
+     */
+    @SuppressWarnings({"rawtypes", "unchecked"}) // fastjson hands back raw Maps
+    public String evaluate(String json, String... keys) {
+        if (StringUtils.isBlank(json) || "[]".equalsIgnoreCase(json) || keys == null) return null;
+        try {
+            List<Map> rows = JSON.parseArray(json, Map.class);
+            for (Map row : rows) {
+                for (String key : keys) {
+                    if (StringUtils.isBlank(key)) {
+                        continue;
+                    }
+                    Object raw = row.get(key);
+                    // A missing key or JSON null is left untouched; a null value used to
+                    // throw here and turn the whole result into "[]".
+                    if (raw != null && raw.toString().length() == 33) {
+                        row.put(key, "");
+                    }
+                }
+            }
+            return JSON.toJSONString(rows);
+        } catch (Exception e) {
+            // Best effort: a malformed row must not fail the job.
+            e.printStackTrace();
+            return "[]";
+        }
+
+    }
+
+    /**
+     * @param o any object, may be null
+     * @return {@code o.toString()}, or null when {@code o} is null
+     */
+    public static String toStringV2(Object o) {
+        if (null == o) return null;
+        return o.toString();
+    }
+
+    public static void main(String[] args) {
+        replace_pid j = new replace_pid();
+        String json = "[{\"name\": \"周一凡\", \"litigant_id\": \"627a851aeb97a3311b7f43a20d3a0df3\"}, {\"name\": \"深圳市中装建设集团股份有限公司\", \"litigant_id\": \"627a851aeb97a3311b7f43a20d3a0df3\"}, {\"name\": \"杭州创意投资发展有限公司\", \"litigant_id\": \"p2772b07553e36f501654c193c2728cd0\"}]";
+        String evaluate = j.evaluate(json, "litigant_id");
+        System.out.println(evaluate);
+    }
+
+}

+ 12 - 0
src/main/java/com/winhc/bigdata/utils/CompanyUtils.java

@@ -3,6 +3,7 @@ package com.winhc.bigdata.utils;
 import com.alibaba.fastjson.JSONObject;
 import com.aliyun.odps.utils.StringUtils;
 import com.github.houbb.opencc4j.util.ZhConverterUtil;
+import com.winhc.bigdata.udf.etl.CompanyPhoneOrEmailMerge;
 
 import java.util.Arrays;
 import java.util.Collections;
@@ -24,6 +25,17 @@ public class CompanyUtils {
         return Arrays.stream(val.split("\t;\t")).map(String::trim).filter(StringUtils::isNotBlank).map(CompanyUtils::valTrim).collect(Collectors.toList());
     }
 
+    /** Matches one character in the CJK range U+4E00..U+9FA5; compiled once instead of on every call. */
+    private static final Pattern CLEANUP2_CJK_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]");
+
+    /**
+     * Removes every CJK character (U+4E00..U+9FA5) from a string.
+     *
+     * @param s raw text; blank or null yields the empty string
+     * @return {@code s} without CJK characters, never null
+     */
+    public static String cleanup2(String s) {
+        if (StringUtils.isBlank(s)) return "";
+        return CLEANUP2_CJK_PATTERN.matcher(s).replaceAll("");
+    }
+
+    /**
+     * Splits {@code val} with {@link #spiltNames(String)}, strips CJK characters
+     * from every part with {@link #cleanup2(String)} and drops the parts that end
+     * up blank.
+     *
+     * @param val the raw, separator-joined value
+     * @return the cleaned, non-blank parts in their original order
+     */
+    public static List<String> spiltNamesPlus(String val) {
+        List<String> parts = spiltNames(val);
+        return parts.stream()
+                .map(part -> cleanup2(part))
+                .filter(part -> StringUtils.isNotBlank(part))
+                .collect(Collectors.toList());
+    }
+
 
     private static String valTrim(String val) {
         if (StringUtils.isBlank(val)) {