9 changed files with 283 additions and 0 deletions
@ -0,0 +1,83 @@ |
|||||
|
package textcomparison |
||||
|
|
||||
|
import ( |
||||
|
"sync" |
||||
|
|
||||
|
"github.com/yanyiwu/gojieba" |
||||
|
) |
||||
|
|
||||
|
type GoJieba struct { |
||||
|
C *gojieba.Jieba |
||||
|
} |
||||
|
|
||||
|
var GJB *GoJieba |
||||
|
var one sync.Once |
||||
|
|
||||
|
func NewGoJieba() *GoJieba { |
||||
|
one.Do(func() { |
||||
|
GJB = &GoJieba{ |
||||
|
C: gojieba.NewJieba(), |
||||
|
//equals with x := NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH)
|
||||
|
} |
||||
|
}) |
||||
|
return GJB |
||||
|
} |
||||
|
|
||||
|
func (this *GoJieba) Close() { |
||||
|
this.C.Free() |
||||
|
} |
||||
|
|
||||
|
func (this *GoJieba) AddWords(words []string) { |
||||
|
for _, word := range words { |
||||
|
this.C.AddWord(word) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
func (this *GoJieba) JiebaCut(rawStr string, useHmm bool, cutAll bool) (words []string) { |
||||
|
if cutAll { |
||||
|
words = jiebaCutAll(this.C, &rawStr) |
||||
|
} else { |
||||
|
words = jiebaCut(this.C, &rawStr, useHmm) |
||||
|
} |
||||
|
|
||||
|
return |
||||
|
} |
||||
|
|
||||
|
func (this *GoJieba) JiebaCutWithFrequency(rawStr string, useHmm bool, cutAll bool) (wordsFreqs map[string]int) { |
||||
|
wordsFreqs = make(map[string]int) |
||||
|
if cutAll { |
||||
|
words := jiebaCutAll(this.C, &rawStr) |
||||
|
for _, word := range words { |
||||
|
freq := wordsFreqs[word] |
||||
|
wordsFreqs[word] = freq + 1 |
||||
|
} |
||||
|
} else { |
||||
|
words := jiebaCut(this.C, &rawStr, useHmm) |
||||
|
for _, word := range words { |
||||
|
freq := wordsFreqs[word] |
||||
|
wordsFreqs[word] = freq + 1 |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return |
||||
|
} |
||||
|
|
||||
|
func (this *GoJieba) JiebaCutForSearch(rawStr string, useHmm bool) { |
||||
|
jiebaCut4Search(this.C, &rawStr, useHmm) |
||||
|
|
||||
|
} |
||||
|
|
||||
|
func jiebaCutAll(x *gojieba.Jieba, rawStr *string) (words []string) { |
||||
|
words = x.CutAll(*rawStr) |
||||
|
return |
||||
|
} |
||||
|
|
||||
|
func jiebaCut(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { |
||||
|
words = x.Cut(*rawStr, useHmm) |
||||
|
return |
||||
|
} |
||||
|
|
||||
|
func jiebaCut4Search(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { |
||||
|
words = x.CutForSearch(*rawStr, useHmm) |
||||
|
return |
||||
|
} |
||||
@ -0,0 +1,109 @@ |
|||||
|
package textcomparison |
||||
|
|
||||
|
import ( |
||||
|
"fmt" |
||||
|
"hash/fnv" |
||||
|
"strings" |
||||
|
) |
||||
|
|
||||
|
// SIMILAR_DISTANCE is an untyped constant with value 3.
// NOTE(review): presumably the Hamming-distance threshold at or below which
// two fingerprints are treated as similar; nothing in this file reads it —
// confirm against callers.
const SIMILAR_DISTANCE = 3
||||
|
|
||||
|
// WordWeight pairs a word with the weight it contributes when a fingerprint
// is computed.
type WordWeight struct {
	// Word is the text that gets hashed.
	Word string
	// Weight scales the word's per-bit contribution.
	Weight float64
}
||||
|
|
||||
|
func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) { |
||||
|
|
||||
|
srcFingerPrint, err := simhashFingerPrint(srcWordWeighs) |
||||
|
if err != nil { |
||||
|
return |
||||
|
} |
||||
|
fmt.Println("srcFingerPrint: ", srcFingerPrint) |
||||
|
dstFingerPrint, err := simhashFingerPrint(dstWordWeights) |
||||
|
if err != nil { |
||||
|
return |
||||
|
} |
||||
|
fmt.Println("dstFingerPrint: ", dstFingerPrint) |
||||
|
|
||||
|
distance = hammingDistance(srcFingerPrint, dstFingerPrint) |
||||
|
|
||||
|
return |
||||
|
} |
||||
|
|
||||
|
func simhashFingerPrint(wordWeights []WordWeight) (fingerPrint []string, err error) { |
||||
|
binaryWeights := make([]float64, 32) |
||||
|
for _, ww := range wordWeights { |
||||
|
bitHash := strHashBitCode(ww.Word) |
||||
|
weights := calcWithWeight(bitHash, ww.Weight) //binary每个元素与weight的乘积结果数组
|
||||
|
binaryWeights, err = sliceInnerPlus(binaryWeights, weights) |
||||
|
//fmt.Printf("ww.Word:%v, bitHash:%v, ww.Weight:%v, binaryWeights: %v\n", ww.Word,bitHash, ww.Weight, binaryWeights)
|
||||
|
if err != nil { |
||||
|
return |
||||
|
} |
||||
|
} |
||||
|
fingerPrint = make([]string, 0) |
||||
|
for _, b := range binaryWeights { |
||||
|
if b > 0 { // bit 1
|
||||
|
fingerPrint = append(fingerPrint, "1") |
||||
|
} else { // bit 0
|
||||
|
fingerPrint = append(fingerPrint, "0") |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return |
||||
|
} |
||||
|
|
||||
|
// strHashBitCode hashes str with 32-bit FNV-1a and returns the hash as a
// 32-character string of '0' and '1', zero-padded on the left.
func strHashBitCode(str string) string {
	hasher := fnv.New32a()
	// The write error is ignored, as in the original.
	_, _ = hasher.Write([]byte(str))
	sum := int64(hasher.Sum32())
	return fmt.Sprintf("%032b", sum)
}
||||
|
|
||||
|
// calcWithWeight maps every character of bitHash to a signed weight: the
// character "0" yields -weight and any other character yields +weight. The
// result has one entry per character and is empty, but non-nil, for an empty
// bitHash.
func calcWithWeight(bitHash string, weight float64) []float64 {
	signed := make([]float64, 0)
	for _, ch := range strings.Split(bitHash, "") {
		switch ch {
		case "0":
			signed = append(signed, float64(-1)*weight)
		default:
			signed = append(signed, float64(weight))
		}
	}
	return signed
}
||||
|
|
||||
|
// sliceInnerPlus returns a new slice holding the element-wise sum of arr1
// and arr2. Neither input is modified.
//
// It returns an error, and a nil result, when either input is nil or when
// the lengths differ. Inputs are validated before the result is allocated,
// so error paths neither allocate nor hand back a meaningless zeroed slice.
func sliceInnerPlus(arr1, arr2 []float64) (dstArr []float64, err error) {
	if arr1 == nil || arr2 == nil {
		return nil, fmt.Errorf("sliceInnerPlus array nil")
	}
	if len(arr1) != len(arr2) {
		return nil, fmt.Errorf("sliceInnerPlus array Length NOT match, %v != %v", len(arr1), len(arr2))
	}

	dstArr = make([]float64, len(arr1))
	for i := range arr1 {
		dstArr[i] = arr1[i] + arr2[i]
	}
	return dstArr, nil
}
||||
|
|
||||
|
// hammingDistance returns the number of positions at which arr1 and arr2
// differ.
//
// For equal-length inputs this is the ordinary Hamming distance. When the
// lengths differ, every position present in only one slice counts as a
// mismatch. Previously a shorter arr2 caused an index-out-of-range panic and
// the tail of a longer arr2 was silently ignored.
func hammingDistance(arr1, arr2 []string) int {
	shorter, longer := arr1, arr2
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	// Positions beyond the common prefix have no counterpart to match.
	count := len(longer) - len(shorter)
	for i, v := range shorter {
		if v != longer[i] {
			count++
		}
	}
	return count
}
||||
@ -0,0 +1,30 @@ |
|||||
|
package textcomparison |
||||
|
|
||||
|
import ( |
||||
|
"regexp" |
||||
|
"strings" |
||||
|
) |
||||
|
|
||||
|
// Patterns used by RemoveHtml, compiled once at package initialisation.
// They are raw strings, so each regex escape takes a single backslash.
var (
	removeHTMLTagRe        = regexp.MustCompile(`<[\S\s]+?>`)
	removeHTMLStyleRe      = regexp.MustCompile(`<style[\S\s]+?</style>`)
	removeHTMLScriptRe     = regexp.MustCompile(`<script[\S\s]+?</script>`)
	removeHTMLWhitespaceRe = regexp.MustCompile(`\s{2,}`)
)

// RemoveHtml strips HTML markup from src and returns the remaining text.
//
// Steps, in order:
//  1. lower-case everything inside angle brackets, so the style/script
//     patterns match regardless of the original tag case;
//  2. delete <style>...</style> blocks together with their content;
//  3. delete <script>...</script> blocks together with their content;
//  4. replace every remaining tag with a newline;
//  5. collapse each run of two or more whitespace characters into a single
//     newline.
//
// The previous patterns were raw strings with doubled backslashes, which
// made them match literal backslash characters rather than tags or
// whitespace, so ordinary HTML passed through unchanged.
func RemoveHtml(src string) string {
	// Convert all HTML tags to lower case.
	src = removeHTMLTagRe.ReplaceAllStringFunc(src, strings.ToLower)

	// Remove STYLE blocks.
	src = removeHTMLStyleRe.ReplaceAllString(src, "")

	// Remove SCRIPT blocks.
	src = removeHTMLScriptRe.ReplaceAllString(src, "")

	// Replace every remaining tag with a newline.
	src = removeHTMLTagRe.ReplaceAllString(src, "\n")

	// Collapse consecutive whitespace, including repeated newlines.
	src = removeHTMLWhitespaceRe.ReplaceAllString(src, "\n")

	return src
}
||||
Loading…
Reference in new issue