|
@@ -0,0 +1,41 @@
|
|
|
|
+package com.winhc.bigdata.udf;
|
|
|
|
+
|
|
|
|
+import com.aliyun.odps.udf.UDF;
|
|
|
|
+import com.aliyun.odps.utils.StringUtils;
|
|
|
|
+
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
+
|
|
|
|
+/**
|
|
|
|
+ * @Description: 全角转换后,字符串去符号
|
|
|
|
+ */
|
|
|
|
+public class StringCleanupPlus extends UDF {
|
|
|
|
+ private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z]");
|
|
|
|
+
|
|
|
|
+ public String evaluate(String val) {
|
|
|
|
+ return StringUtils.isNotBlank(val) ? pattern.matcher(ToDBC(val)).replaceAll("") : "";
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ public static String ToDBC(String input) {
|
|
|
|
+ String returnString = "";
|
|
|
|
+ if(StringUtils.isNotBlank(input)){
|
|
|
|
+ char c[] = input.toCharArray();
|
|
|
|
+ for (int i = 0; i < c.length; i++) {
|
|
|
|
+ if (c[i] == '\u3000') {
|
|
|
|
+ c[i] = ' ';
|
|
|
|
+ } else if (c[i] > '\uFF00' && c[i] < '\uFF5F') {
|
|
|
|
+ c[i] = (char) (c[i] - 65248);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ returnString = new String(c);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return returnString;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public static void main(String[] args) {
|
|
|
|
+ System.out.println(ToDBC("xx张三zzxxx"));
|
|
|
|
+ System.out.println(ToDBC(""));
|
|
|
|
+ System.out.println(new StringCleanupPlus().evaluate("xx张三zz x xx (..."));
|
|
|
|
+ }
|
|
|
|
+}
|