// Java string similarity calculation (Java 相似度计算)
/**
 * Command-line utility that reads the first column of an Excel workbook and prints
 * every pair of values whose normalized Levenshtein similarity reaches a threshold.
 */
public class Same {

    /** Path of the workbook read by {@link #main(String[])}. */
    private static final String WORKBOOK_PATH = "D:\\1.xlsx";

    /** Zero-based index of the sheet that is read. */
    private static final int SHEET_INDEX = 0;

    /** Highest zero-based row index that is read; rows beyond it are ignored. */
    private static final int LAST_ROW_INDEX = 899;

    /** Minimum similarity (inclusive) for a pair to be printed. */
    private static final double SIMILARITY_THRESHOLD = 0.80;

    /**
     * Reads the workbook, compares every unordered pair of first-column values and
     * prints the pairs that are similar enough.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File(WORKBOOK_PATH);
        // try-with-resources closes both the stream and the workbook, even on failure.
        try (FileInputStream in = new FileInputStream(file);
             XSSFWorkbook workbook = new XSSFWorkbook(in)) {
            Sheet sheet = workbook.getSheetAt(SHEET_INDEX);
            List<String> values = readFirstColumn(sheet);
            // NOTE(review): nothing is ever stored in this map, so the final line
            // printed below is always 0 — confirm whether it was meant to collect
            // the matching pairs.
            HashMap<String, String> map = new HashMap<>();
            printSimilarPairs(values);
            System.out.println(map.size());
        } catch (Exception e) {
            // Best-effort tool: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Collects the text of column 0 for rows {@code 0..LAST_ROW_INDEX}, skipping rows
     * or cells that do not exist instead of failing on them.
     */
    private static List<String> readFirstColumn(Sheet sheet) {
        List<String> values = new ArrayList<>();
        int lastRow = Math.min(LAST_ROW_INDEX, sheet.getLastRowNum());
        for (int i = 0; i <= lastRow; i++) {
            Row row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            Cell cell = row.getCell(0);
            if (cell == null) {
                continue;
            }
            // Force the cell to text so non-string cells can be read as a string.
            cell.setCellType(CellType.STRING);
            values.add(cell.getStringCellValue());
        }
        return values;
    }

    /**
     * Prints each unordered pair (i &lt; j) whose similarity is at least the threshold;
     * starting the inner loop at i + 1 avoids comparing a pair twice.
     */
    private static void printSimilarPairs(List<String> values) {
        for (int i = 0; i < values.size() - 1; i++) {
            String a = values.get(i);
            for (int j = i + 1; j < values.size(); j++) {
                String b = values.get(j);
                float levenshtein = Levenshtein(a, b);
                if (levenshtein >= SIMILARITY_THRESHOLD) {
                    System.out.println(a + "<--->" + b + "," + levenshtein + "\r\n");
                }
            }
        }
    }

    /**
     * Computes a similarity score between two strings as
     * {@code 1 - editDistance / max(length)}.
     *
     * @param a first string, may be {@code null}
     * @param b second string, may be {@code null}
     * @return 1 when both are {@code null} or both are empty, 0 when exactly one is
     *         {@code null}, otherwise a value in [0, 1] where 1 means identical
     */
    public static float Levenshtein(String a, String b) {
        if (a == null && b == null) {
            return 1f;
        }
        if (a == null || b == null) {
            return 0F;
        }
        int longest = Math.max(a.length(), b.length());
        if (longest == 0) {
            // Two empty strings are identical; also avoids a 0/0 (NaN) result.
            return 1f;
        }
        int editDistance = editDis(a, b);
        return 1 - ((float) editDistance / longest);
    }

    /**
     * Returns the Levenshtein edit distance (insertions, deletions, substitutions,
     * each costing 1) between two non-null strings, comparing {@code char} by
     * {@code char}.
     */
    private static int editDis(String a, String b) {
        int aLen = a.length();
        int bLen = b.length();
        // Turning an empty string into the other one takes one insertion per char.
        if (aLen == 0) {
            return bLen;
        }
        if (bLen == 0) {
            return aLen;
        }
        // Only the previous and the current row of the DP table are needed.
        int[] prev = new int[bLen + 1];
        int[] curr = new int[bLen + 1];
        for (int j = 0; j <= bLen; ++j) {
            prev[j] = j;
        }
        for (int i = 1; i <= aLen; ++i) {
            curr[0] = i;
            for (int j = 1; j <= bLen; ++j) {
                if (a.charAt(i - 1) == b.charAt(j - 1)) {
                    curr[j] = prev[j - 1];
                } else {
                    curr[j] = 1 + Math.min(prev[j - 1], Math.min(curr[j - 1], prev[j]));
                }
            }
            int[] swap = prev;
            prev = curr;
            curr = swap;
        }
        // After the final swap, prev holds the last computed row.
        return prev[bLen];
    }
}