需求:a表某字段与b表字段值做模糊匹配,重合率 > 50% 视为匹配
实现思路:1.通过最长公共子序列算法计算重合率(如下)
2.也可借助ElasticSearch的ik分词器(ik_max_word模式)实现最细粒度的拆分(未实现)
/**
 * Returns the length of the longest common subsequence (LCS) of two strings.
 *
 * <p>Characters are compared as UTF-16 {@code char} values. The table is filled
 * from the ends of both strings toward the start, so {@code table[row][col]}
 * holds the LCS length of {@code str1.substring(row)} and {@code str2.substring(col)}.
 *
 * @param str1 first string; may be {@code null}
 * @param str2 second string; may be {@code null}
 * @return the LCS length, or 0 when either argument is {@code null}
 */
public static int longestCommonSubsequence(String str1, String str2) {
    if (str1 == null || str2 == null) {
        return 0;
    }
    final char[] left = str1.toCharArray();
    final char[] right = str2.toCharArray();
    // The extra last row and column stay zero: they represent an empty suffix.
    final int[][] table = new int[left.length + 1][right.length + 1];
    for (int row = left.length - 1; row >= 0; row--) {
        final char current = left[row];
        for (int col = right.length - 1; col >= 0; col--) {
            if (current == right[col]) {
                // Matching chars extend the LCS of the two remaining suffixes.
                table[row][col] = 1 + table[row + 1][col + 1];
            } else {
                // Otherwise drop one char from either side and keep the better result.
                table[row][col] = Math.max(table[row + 1][col], table[row][col + 1]);
            }
        }
    }
    return table[0][0];
}
/**
 * Computes the overlap rate of two strings.
 *
 * <p>The longest-common-subsequence length of the inputs is passed, together
 * with {@code length}, to {@code txfloat}.
 * NOTE(review): {@code txfloat} is defined outside this view; presumably it
 * returns the first argument divided by the second as a decimal — confirm,
 * including how it behaves when {@code length} is 0.
 *
 * @param str1 first string
 * @param str2 second string
 * @param length value handed to {@code txfloat} as its second argument
 * @return the value produced by {@code txfloat} for (LCS length, {@code length})
 */
public static double coincidenceRate(String str1, String str2, int length) {
    return txfloat(longestCommonSubsequence(str1, str2), length);
}
/**
 * Demo entry point: prints the overlap rate of a sample pair of texts,
 * passing the second text's length as the {@code length} argument.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String candidate = "水系综合治理及运营维护PPP项目";
    final String reference = "水系综合治理及运营维护PPP项目(赤星河)新店镇赤星村委会250kVA永久性用电";
    final int denominator = reference.length();
    final double rate = coincidenceRate(candidate, reference, denominator);
    System.out.println(rate);
}