9 changed files with 283 additions and 0 deletions
@ -0,0 +1,83 @@ |
|||
package textcomparison |
|||
|
|||
import ( |
|||
"sync" |
|||
|
|||
"github.com/yanyiwu/gojieba" |
|||
) |
|||
|
|||
type GoJieba struct { |
|||
C *gojieba.Jieba |
|||
} |
|||
|
|||
var GJB *GoJieba |
|||
var one sync.Once |
|||
|
|||
func NewGoJieba() *GoJieba { |
|||
one.Do(func() { |
|||
GJB = &GoJieba{ |
|||
C: gojieba.NewJieba(), |
|||
//equals with x := NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH)
|
|||
} |
|||
}) |
|||
return GJB |
|||
} |
|||
|
|||
func (this *GoJieba) Close() { |
|||
this.C.Free() |
|||
} |
|||
|
|||
func (this *GoJieba) AddWords(words []string) { |
|||
for _, word := range words { |
|||
this.C.AddWord(word) |
|||
} |
|||
} |
|||
|
|||
func (this *GoJieba) JiebaCut(rawStr string, useHmm bool, cutAll bool) (words []string) { |
|||
if cutAll { |
|||
words = jiebaCutAll(this.C, &rawStr) |
|||
} else { |
|||
words = jiebaCut(this.C, &rawStr, useHmm) |
|||
} |
|||
|
|||
return |
|||
} |
|||
|
|||
func (this *GoJieba) JiebaCutWithFrequency(rawStr string, useHmm bool, cutAll bool) (wordsFreqs map[string]int) { |
|||
wordsFreqs = make(map[string]int) |
|||
if cutAll { |
|||
words := jiebaCutAll(this.C, &rawStr) |
|||
for _, word := range words { |
|||
freq := wordsFreqs[word] |
|||
wordsFreqs[word] = freq + 1 |
|||
} |
|||
} else { |
|||
words := jiebaCut(this.C, &rawStr, useHmm) |
|||
for _, word := range words { |
|||
freq := wordsFreqs[word] |
|||
wordsFreqs[word] = freq + 1 |
|||
} |
|||
} |
|||
|
|||
return |
|||
} |
|||
|
|||
func (this *GoJieba) JiebaCutForSearch(rawStr string, useHmm bool) { |
|||
jiebaCut4Search(this.C, &rawStr, useHmm) |
|||
|
|||
} |
|||
|
|||
func jiebaCutAll(x *gojieba.Jieba, rawStr *string) (words []string) { |
|||
words = x.CutAll(*rawStr) |
|||
return |
|||
} |
|||
|
|||
func jiebaCut(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { |
|||
words = x.Cut(*rawStr, useHmm) |
|||
return |
|||
} |
|||
|
|||
func jiebaCut4Search(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { |
|||
words = x.CutForSearch(*rawStr, useHmm) |
|||
return |
|||
} |
|||
@ -0,0 +1,109 @@ |
|||
package textcomparison |
|||
|
|||
import ( |
|||
"fmt" |
|||
"hash/fnv" |
|||
"strings" |
|||
) |
|||
|
|||
// SIMILAR_DISTANCE is an untyped integer constant equal to 3.
// NOTE(review): presumably the Hamming-distance threshold at or below which
// two fingerprints count as similar — confirm against callers.
const SIMILAR_DISTANCE = 3
|||
|
|||
// WordWeight pairs a word with the weight it contributes to a simhash
// fingerprint.
type WordWeight struct {
	// Word is the token that gets hashed.
	Word string
	// Weight is added to (or subtracted from) each bit position of the
	// fingerprint accumulator, depending on the corresponding hash bit.
	Weight float64
}
|||
|
|||
func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) { |
|||
|
|||
srcFingerPrint, err := simhashFingerPrint(srcWordWeighs) |
|||
if err != nil { |
|||
return |
|||
} |
|||
fmt.Println("srcFingerPrint: ", srcFingerPrint) |
|||
dstFingerPrint, err := simhashFingerPrint(dstWordWeights) |
|||
if err != nil { |
|||
return |
|||
} |
|||
fmt.Println("dstFingerPrint: ", dstFingerPrint) |
|||
|
|||
distance = hammingDistance(srcFingerPrint, dstFingerPrint) |
|||
|
|||
return |
|||
} |
|||
|
|||
func simhashFingerPrint(wordWeights []WordWeight) (fingerPrint []string, err error) { |
|||
binaryWeights := make([]float64, 32) |
|||
for _, ww := range wordWeights { |
|||
bitHash := strHashBitCode(ww.Word) |
|||
weights := calcWithWeight(bitHash, ww.Weight) //binary每个元素与weight的乘积结果数组
|
|||
binaryWeights, err = sliceInnerPlus(binaryWeights, weights) |
|||
//fmt.Printf("ww.Word:%v, bitHash:%v, ww.Weight:%v, binaryWeights: %v\n", ww.Word,bitHash, ww.Weight, binaryWeights)
|
|||
if err != nil { |
|||
return |
|||
} |
|||
} |
|||
fingerPrint = make([]string, 0) |
|||
for _, b := range binaryWeights { |
|||
if b > 0 { // bit 1
|
|||
fingerPrint = append(fingerPrint, "1") |
|||
} else { // bit 0
|
|||
fingerPrint = append(fingerPrint, "0") |
|||
} |
|||
} |
|||
|
|||
return |
|||
} |
|||
|
|||
// strHashBitCode hashes str with 32-bit FNV-1a and returns the hash as a
// binary string, left-padded with zeros to exactly 32 characters.
func strHashBitCode(str string) string {
	hasher := fnv.New32a()
	hasher.Write([]byte(str))
	return fmt.Sprintf("%032b", hasher.Sum32())
}
|||
|
|||
// calcWithWeight turns a bit string into a slice of signed weights: one
// element per character of bitHash, equal to -weight where the character is
// "0" and +weight for any other character. The result is never nil.
func calcWithWeight(bitHash string, weight float64) []float64 {
	bits := strings.Split(bitHash, "")
	signed := make([]float64, len(bits))

	for i, bit := range bits {
		if bit != "0" {
			signed[i] = weight
			continue
		}
		signed[i] = -1 * weight
	}

	return signed
}
|||
|
|||
// sliceInnerPlus returns a new slice holding the element-wise sum of arr1
// and arr2. Neither input is modified.
//
// An error is returned when either slice is nil or when their lengths
// differ. In that case dstArr is still a zero-filled slice of len(arr1)
// rather than nil.
func sliceInnerPlus(arr1, arr2 []float64) (dstArr []float64, err error) {
	size := len(arr1)
	dstArr = make([]float64, size)

	switch {
	case arr1 == nil || arr2 == nil:
		return dstArr, fmt.Errorf("sliceInnerPlus array nil")
	case size != len(arr2):
		return dstArr, fmt.Errorf("sliceInnerPlus array Length NOT match, %v != %v", size, len(arr2))
	}

	for i := range dstArr {
		dstArr[i] = arr1[i] + arr2[i]
	}

	return dstArr, nil
}
|||
|
|||
// hammingDistance returns the number of positions at which arr1 and arr2
// hold different values.
//
// The slices are compared over their common prefix. When the lengths differ,
// every position that exists in only one slice counts as a difference, so
// the function never indexes out of range and never ignores trailing
// elements.
func hammingDistance(arr1, arr2 []string) int {
	shorter, longer := len(arr1), len(arr2)
	if shorter > longer {
		shorter, longer = longer, shorter
	}

	// Unmatched tail positions are differences by definition.
	count := longer - shorter
	for i := 0; i < shorter; i++ {
		if arr1[i] != arr2[i] {
			count++
		}
	}

	return count
}
|||
@ -0,0 +1,30 @@ |
|||
package textcomparison |
|||
|
|||
import ( |
|||
"regexp" |
|||
"strings" |
|||
) |
|||
|
|||
// Patterns used by RemoveHtml, compiled once at package initialisation.
// They are written as raw strings, so each backslash appears exactly once.
var (
	// htmlTagPattern matches anything from "<" up to the nearest ">".
	htmlTagPattern = regexp.MustCompile(`<[\S\s]+?>`)
	// htmlStylePattern matches a lower-case <style ...>...</style> element.
	htmlStylePattern = regexp.MustCompile(`<style[\S\s]+?</style>`)
	// htmlScriptPattern matches a lower-case <script ...>...</script> element.
	htmlScriptPattern = regexp.MustCompile(`<script[\S\s]+?</script>`)
	// htmlBlankRunPattern matches two or more consecutive whitespace characters.
	htmlBlankRunPattern = regexp.MustCompile(`\s{2,}`)
)

// RemoveHtml strips HTML markup from src and returns the remaining text.
//
// Processing steps, in order:
//  1. lower-case every tag so that the style/script patterns match
//     regardless of the original casing (text outside tags is untouched);
//  2. drop <style> elements together with their content;
//  3. drop <script> elements together with their content;
//  4. replace every remaining tag with a newline;
//  5. collapse each run of two or more whitespace characters into a single
//     newline.
//
// This is a regular-expression based text extractor, not an HTML sanitizer:
// any "<" followed later by ">" is treated as a tag, and the output must not
// be relied on as safe to embed in HTML.
func RemoveHtml(src string) string {
	src = htmlTagPattern.ReplaceAllStringFunc(src, strings.ToLower)
	src = htmlStylePattern.ReplaceAllString(src, "")
	src = htmlScriptPattern.ReplaceAllString(src, "")
	src = htmlTagPattern.ReplaceAllString(src, "\n")
	return htmlBlankRunPattern.ReplaceAllString(src, "\n")
}
|||
Loading…
Reference in new issue