소스 검색

Merge remote-tracking branch 'origin/master'

许家凯 4 년 전
부모
커밋
862942c7af
2개의 변경된 파일, 50개의 추가 및 2개의 삭제
  1. 9 2
      src/main/java/com/winhc/bigdata/udf/NameCleanup.java
  2. 41 0
      src/main/java/com/winhc/bigdata/udf/StringCleanupPlus.java

+ 9 - 2
src/main/java/com/winhc/bigdata/udf/NameCleanup.java

@@ -12,9 +12,16 @@ import java.util.regex.Pattern;
  * @Description: 字符串去符号
  */
 public class NameCleanup extends UDF {
-    private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z()() ·]");
+    private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z()() ·,]"); // negated class: matches every char except U+4E00-U+9FA5, 0-9, a-z, A-Z, '(', ')', space, '·' and ','; the ',' is what this change adds to the kept set
 
     public String evaluate(String val) {
-        return StringUtils.isNotBlank(val) ? pattern.matcher(val).replaceAll("").replaceAll(" +", " ") : "";
+        return StringUtils.isNotBlank(val) ? pattern.matcher(val.replaceAll("[、,;]",",")).replaceAll("").replaceAll(" +", " ") : ""; // rewrites each char in [、,;] to ',', then deletes every pattern match, then collapses runs of spaces to one; yields "" when StringUtils.isNotBlank(val) is false. NOTE(review): ',' maps to itself as shown - presumably the original class held full-width punctuation; confirm
+    }
+
+    public static void main(String[] args) { // ad-hoc manual check: prints evaluate() for one sample string to stdout
+        NameCleanup n = new NameCleanup();
+        System.out.println(n.evaluate("执行董事、经理、法定代表人"));
     }
 }
+
+

+ 41 - 0
src/main/java/com/winhc/bigdata/udf/StringCleanupPlus.java

@@ -0,0 +1,41 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * @Description: Converts full-width characters to half-width, then strips every character that is not in U+4E00-U+9FA5, an ASCII digit or an ASCII letter.
+ */
+public class StringCleanupPlus extends UDF {
+    private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z]"); // negated class: matches every char except U+4E00-U+9FA5, 0-9, a-z and A-Z
+
+    public String evaluate(String val) { // returns ToDBC(val) with every pattern match removed; "" when StringUtils.isNotBlank(val) is false
+        return StringUtils.isNotBlank(val) ? pattern.matcher(ToDBC(val)).replaceAll("") : "";
+    }
+
+
+    public static String ToDBC(String input) { // maps full-width chars to their half-width counterparts; returns "" when StringUtils.isNotBlank(input) is false
+        String returnString = "";
+        if(StringUtils.isNotBlank(input)){
+            char c[] = input.toCharArray();
+            for (int i = 0; i < c.length; i++) {
+                if (c[i] == '\u3000') { // U+3000 (ideographic space) has no fixed-offset twin, so it is mapped to ASCII space explicitly
+                    c[i] = ' ';
+                } else if (c[i] > '\uFF00' && c[i] < '\uFF5F') { // exclusive bounds select U+FF01..U+FF5E, the full-width forms of ASCII '!'..'~'
+                    c[i] = (char) (c[i] - 65248); // 65248 == 0xFEE0, the distance from U+FF01..U+FF5E down to U+0021..U+007E
+                }
+            }
+            returnString = new String(c);
+        }
+
+        return returnString;
+    }
+
+    public static void main(String[] args) { // ad-hoc manual check: prints the results for a few sample strings to stdout
+        System.out.println(ToDBC("xx张三zzxxx")); // NOTE(review): the sample literals appear half-width here - presumably full-width in the original file; confirm
+        System.out.println(ToDBC(""));
+        System.out.println(new StringCleanupPlus().evaluate("xx张三zz x xx (..."));
+    }
+}