ps:
package main
import (
"fmt"
"strings"
)
func main() {
fmt.Println(hemingwayDistance(simHash("rng tes jdg edg"), simHash("rng edg tes jdg")))
fmt.Println(hemingwayDistance(simHash("gen.g t1 dk drx"), simHash("rng edg tes jdg")))
fmt.Println(hemingwayDistance(simHash("wo zhen de hui xie"), simHash("ting wo shuo xie xie ni")))
fmt.Println(hemingwayDistance(simHash("wo zhen de hui xie"), simHash("wo shi zhen de xie xie ni")))
}
// simHash computes a 32-bit SimHash fingerprint of statement.
//
// The statement is split into whitespace-separated words and every word is
// hashed with stringHash. Each word then votes on the 32 low bits of its
// hash: +1 where the bit is set, -1 where it is clear. A fingerprint bit is
// set only when its vote total is strictly positive, so ties resolve to 0.
//
// The fingerprint occupies the low 32 bits of the returned int. A statement
// with no words yields 0.
func simHash(statement string) int {
	const width = 32
	var votes [width]int

	// strings.Fields drops the empty tokens that leading, trailing or
	// repeated spaces would produce with a plain split; such tokens hash to
	// 0 and would otherwise vote against every bit.
	for _, word := range strings.Fields(statement) {
		// Work on the low 32 bits as an unsigned value. With a signed hash,
		// h%2 is -1 (never 1) for odd negative numbers, so the set bits of a
		// hash that overflowed to a negative value would be counted as clear.
		h := uint32(stringHash(word))
		for i := range votes {
			if (h>>uint(i))&1 == 1 {
				votes[i]++
			} else {
				votes[i]--
			}
		}
	}

	var fingerprint uint32
	for i, v := range votes {
		if v > 0 {
			fingerprint |= 1 << uint(i)
		}
	}
	return int(fingerprint)
}
// stringHash returns a plain polynomial hash of str: starting from 0, each
// byte b of the string updates the accumulator as acc = 31*acc + b.
// The arithmetic is done in int, so long inputs wrap around silently.
func stringHash(str string) int {
	acc := 0
	for _, b := range []byte(str) {
		// A byte widens to an int in 0..255, so no masking is needed.
		acc = acc*31 + int(b)
	}
	return acc
}
// hemingwayDistance returns the Hamming distance between the low 32 bits of
// a and b, i.e. the number of bit positions (0..31) at which they differ.
// The identifier keeps its original spelling so existing callers still work.
//
// Both arguments are treated as raw bit patterns. Comparing a%2 with b%2 and
// halving, as a signed loop would, follows the bits of the magnitude for
// negative values and gives wrong counts, so the XOR is taken on the
// unsigned 32-bit view instead.
func hemingwayDistance(a, b int) int {
	diff := uint32(a ^ b)
	count := 0
	// Each iteration clears the lowest set bit, so the loop runs once per
	// differing position.
	for ; diff != 0; diff &= diff - 1 {
		count++
	}
	return count
}
输出:
0
5
6
1
ps. 一般海明距离（Hamming distance）<= 3 就认为两段文本比较相似；海明距离越大，说明两者相差越多。