Browse Source

分词相似度处理

qin_1
超级管理员 4 years ago
parent
commit
5c8b240835
  1. 2
      gin_server_admin/api/admin/dutyassess/assesstype.go
  2. 12
      gin_server_admin/api/admin/dutyassess/dutyhandle.go
  3. 83
      gin_server_admin/api/textcomparison/goJieba.go
  4. 109
      gin_server_admin/api/textcomparison/similarity.go
  5. 30
      gin_server_admin/api/textcomparison/utils.go
  6. 42
      gin_server_admin/api/v1/shiyan/shiyan.go
  7. 1
      gin_server_admin/go.mod
  8. 2
      gin_server_admin/go.sum
  9. 2
      gin_server_admin/router/shiyan/sys_shiyan.go

2
gin_server_admin/api/admin/dutyassess/assesstype.go

@ -152,6 +152,8 @@ type OutDutyList struct {
State int `json:"state"`
DepartmentId []int64 `json:"departmentid"`
DepartmentAry []DepartmentAryType `json:"departmentmap"`
UserList []string `json:"userlist"` //执行人列表
UserListAry []QualEvalArrt `json:"userlistary"` //执行人列表
}
type DepartmentAryType struct {
Parentid string `json:"parentid"`

12
gin_server_admin/api/admin/dutyassess/dutyhandle.go

@ -85,6 +85,18 @@ func (d *DutyAssessApi) DutyDepartmentList(c *gin.Context) {
targetCont.State = 1
targetCont.DepartmentId, targetCont.DepartmentAry = GetDutyFangAnDepart(v, 1)
shenPiRen := strings.Split(tarCont.Report, ",")
targetCont.UserList = shenPiRen
for _, v := range shenPiRen {
usCont, usErr := commonus.GetUserInfoPublic([]string{"worker_man.wm_number", "worker_man_data.wmd_name"}, map[string]interface{}{"wm_key": v})
if usErr == true {
var userCont QualEvalArrt
userCont.Id = v
userCont.Name = usCont.Name
targetCont.UserListAry = append(targetCont.UserListAry, userCont)
}
}
targetList = append(targetList, targetCont)
}
// var qualitativeEvaluationAry []assessmentmodel.QualitativeEvaluation

83
gin_server_admin/api/textcomparison/goJieba.go

@ -0,0 +1,83 @@
package textcomparison
import (
"sync"
"github.com/yanyiwu/gojieba"
)
// GoJieba wraps a gojieba.Jieba segmenter handle.
type GoJieba struct {
// C is the underlying gojieba segmenter that the GoJieba methods call into.
C *gojieba.Jieba
}
// GJB is the package-level GoJieba instance; it is assigned inside NewGoJieba.
var GJB *GoJieba
// one guards the assignment of GJB so that it runs only once.
var one sync.Once
// NewGoJieba returns the shared GoJieba instance, constructing it on the
// first call. The sync.Once guard means the segmenter is built only once;
// every later call returns the same pointer.
func NewGoJieba() *GoJieba {
	one.Do(func() {
		// NewJieba with no arguments is equivalent to
		// NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH).
		segmenter := gojieba.NewJieba()
		GJB = &GoJieba{C: segmenter}
	})
	return GJB
}
// Close releases the underlying segmenter by calling Free on it.
// NOTE(review): the package-level GJB keeps pointing at this instance
// afterwards — confirm callers do not reuse it after Close.
func (g *GoJieba) Close() {
	g.C.Free()
}
// AddWords passes every entry of words to the segmenter's AddWord, in order.
func (g *GoJieba) AddWords(words []string) {
	for i := range words {
		g.C.AddWord(words[i])
	}
}
// JiebaCut segments rawStr and returns the resulting words.
//
// When cutAll is true the segmenter's CutAll is used and useHmm is ignored;
// otherwise Cut is called with the given useHmm flag.
func (g *GoJieba) JiebaCut(rawStr string, useHmm bool, cutAll bool) (words []string) {
	if cutAll {
		return g.C.CutAll(rawStr)
	}
	return g.C.Cut(rawStr, useHmm)
}
// JiebaCutWithFrequency segments rawStr and returns how many times each
// resulting word occurs.
//
// When cutAll is true the segmenter's CutAll is used and useHmm is ignored;
// otherwise Cut is called with the given useHmm flag. The returned map is
// never nil, even when segmentation yields no words.
func (g *GoJieba) JiebaCutWithFrequency(rawStr string, useHmm bool, cutAll bool) (wordsFreqs map[string]int) {
	var tokens []string
	if cutAll {
		tokens = g.C.CutAll(rawStr)
	} else {
		tokens = g.C.Cut(rawStr, useHmm)
	}

	wordsFreqs = make(map[string]int)
	for _, tok := range tokens {
		// A missing key reads as zero, so the increment also covers first sightings.
		wordsFreqs[tok]++
	}
	return wordsFreqs
}
// JiebaCutForSearch segments rawStr with the segmenter's CutForSearch and
// returns the resulting words; useHmm is forwarded unchanged.
//
// The previous version computed the segmentation and discarded it, so the
// method had no observable effect. Returning the words keeps existing
// statement-style calls compiling while making the result available.
func (g *GoJieba) JiebaCutForSearch(rawStr string, useHmm bool) (words []string) {
	return g.C.CutForSearch(rawStr, useHmm)
}
// jiebaCutAll calls x.CutAll on the string rawStr points to and returns the
// words it produces. rawStr must not be nil.
func jiebaCutAll(x *gojieba.Jieba, rawStr *string) (words []string) {
	text := *rawStr
	return x.CutAll(text)
}
// jiebaCut calls x.Cut on the string rawStr points to, forwarding useHmm,
// and returns the words it produces. rawStr must not be nil.
func jiebaCut(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) {
	text := *rawStr
	return x.Cut(text, useHmm)
}
// jiebaCut4Search calls x.CutForSearch on the string rawStr points to,
// forwarding useHmm, and returns the words it produces. rawStr must not be nil.
func jiebaCut4Search(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) {
	text := *rawStr
	return x.CutForSearch(text, useHmm)
}

109
gin_server_admin/api/textcomparison/similarity.go

@ -0,0 +1,109 @@
package textcomparison
import (
"fmt"
"hash/fnv"
"strings"
)
const (
// SIMILAR_DISTANCE is not referenced by any code visible in this file.
// NOTE(review): presumably the largest Hamming distance at which two
// fingerprints still count as similar — confirm against callers.
SIMILAR_DISTANCE = 3
)
// WordWeight pairs a word with a numeric weight; SimHashSimilar takes slices of it.
type WordWeight struct {
Word string // the word that is hashed when building a fingerprint
Weight float64 // the amount added to or subtracted from each fingerprint position for this word
}
// SimHashSimilar builds a fingerprint for each of the two weighted word
// lists and returns the number of positions at which the fingerprints
// differ.
//
// Each fingerprint is printed to stdout as it is computed. If either
// fingerprint cannot be built, the error is returned with a distance of 0.
func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) {
	var srcFingerPrint, dstFingerPrint []string

	if srcFingerPrint, err = simhashFingerPrint(srcWordWeighs); err != nil {
		return 0, err
	}
	fmt.Println("srcFingerPrint: ", srcFingerPrint)

	if dstFingerPrint, err = simhashFingerPrint(dstWordWeights); err != nil {
		return 0, err
	}
	fmt.Println("dstFingerPrint: ", dstFingerPrint)

	return hammingDistance(srcFingerPrint, dstFingerPrint), nil
}
// simhashFingerPrint folds the weighted words into a 32-position
// fingerprint made of the strings "1" and "0".
//
// Every word is hashed to a 32-character bit string; each position
// contributes +Weight for a set bit and -Weight for a clear bit. After all
// words are summed, a position becomes "1" when its total is strictly
// positive and "0" otherwise. An empty input therefore yields 32 "0"s.
// On a summation error the fingerprint is nil and the error is returned.
func simhashFingerPrint(wordWeights []WordWeight) (fingerPrint []string, err error) {
	totals := make([]float64, 32)
	for i := range wordWeights {
		code := strHashBitCode(wordWeights[i].Word)
		// Per-position products of the hash bits (as +1/-1) and the word's weight.
		signed := calcWithWeight(code, wordWeights[i].Weight)
		if totals, err = sliceInnerPlus(totals, signed); err != nil {
			return nil, err
		}
	}

	fingerPrint = make([]string, len(totals))
	for i, sum := range totals {
		bit := "0"
		if sum > 0 {
			bit = "1"
		}
		fingerPrint[i] = bit
	}
	return fingerPrint, nil
}
// strHashBitCode hashes str with 32-bit FNV-1a and renders the hash as a
// binary string zero-padded to 32 characters.
func strHashBitCode(str string) string {
	hasher := fnv.New32a()
	hasher.Write([]byte(str))
	return fmt.Sprintf("%032b", hasher.Sum32())
}
// calcWithWeight maps every character of bitHash to a signed weight:
// a "0" character becomes -weight and any other character becomes +weight.
// The result has one entry per character and is never nil.
func calcWithWeight(bitHash string, weight float64) []float64 {
	bits := strings.Split(bitHash, "")
	signed := make([]float64, len(bits))
	for i, bit := range bits {
		switch bit {
		case "0":
			signed[i] = -1 * weight
		default:
			signed[i] = weight
		}
	}
	return signed
}
// sliceInnerPlus adds arr1 and arr2 element by element and returns the sums
// in a newly allocated slice.
//
// An error is returned when either argument is nil or when the lengths
// differ. In both error cases the returned slice is still allocated with
// len(arr1) zero-valued entries.
func sliceInnerPlus(arr1, arr2 []float64) (dstArr []float64, err error) {
	dstArr = make([]float64, len(arr1))
	switch {
	case arr1 == nil || arr2 == nil:
		return dstArr, fmt.Errorf("sliceInnerPlus array nil")
	case len(arr1) != len(arr2):
		return dstArr, fmt.Errorf("sliceInnerPlus array Length NOT match, %v != %v", len(arr1), len(arr2))
	}
	for i := range dstArr {
		dstArr[i] = arr1[i] + arr2[i]
	}
	return dstArr, nil
}
// hammingDistance returns the number of positions at which arr1 and arr2
// hold different strings.
//
// When the slices differ in length, every position that exists in only one
// of them counts as a difference. The previous version indexed arr2 without
// a bounds check, so it panicked when arr2 was shorter than arr1 and
// silently ignored the extra entries when arr2 was longer. For equal-length
// inputs the result is unchanged.
func hammingDistance(arr1, arr2 []string) int {
	shorter, longer := arr1, arr2
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	// Positions beyond the shorter slice have no counterpart and so differ.
	count := len(longer) - len(shorter)
	for i, v := range shorter {
		if v != longer[i] {
			count++
		}
	}
	return count
}

30
gin_server_admin/api/textcomparison/utils.go

@ -0,0 +1,30 @@
package textcomparison
import (
"regexp"
"strings"
)
// Patterns used by RemoveHtml, compiled once at package initialisation.
// They are raw strings, so each backslash is written exactly once.
var (
	htmlTagRe    = regexp.MustCompile(`<[\S\s]+?>`)
	htmlStyleRe  = regexp.MustCompile(`<style[\S\s]+?</style>`)
	htmlScriptRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
	htmlSpaceRe  = regexp.MustCompile(`\s{2,}`)
)

// RemoveHtml strips HTML markup from src and returns the remaining text.
//
// Processing order:
//  1. every <...> tag is lower-cased so the style/script patterns match
//     regardless of the original case;
//  2. <style>...</style> blocks are removed together with their content;
//  3. <script>...</script> blocks are removed together with their content;
//  4. each remaining tag is replaced by a newline;
//  5. every run of two or more whitespace characters collapses to a single
//     newline.
//
// The previous patterns were raw strings that still carried doubled
// backslashes (e.g. `\\<`), which the regexp engine reads as a literal
// backslash followed by '<'. They never matched real HTML, so the function
// returned its input unchanged.
func RemoveHtml(src string) string {
	// Lower-case all HTML tags.
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	// Remove STYLE blocks.
	src = htmlStyleRe.ReplaceAllString(src, "")
	// Remove SCRIPT blocks.
	src = htmlScriptRe.ReplaceAllString(src, "")
	// Replace everything inside angle brackets with a newline.
	src = htmlTagRe.ReplaceAllString(src, "\n")
	// Collapse consecutive whitespace into one newline.
	src = htmlSpaceRe.ReplaceAllString(src, "\n")
	return src
}

42
gin_server_admin/api/v1/shiyan/shiyan.go

@ -9,6 +9,7 @@ import (
"strconv"
"time"
"github.com/flipped-aurora/gin-vue-admin/server/api/textcomparison"
"github.com/flipped-aurora/gin-vue-admin/server/api/wechatapp/sendmessage"
"github.com/flipped-aurora/gin-vue-admin/server/commonus"
"github.com/flipped-aurora/gin-vue-admin/server/global"
@ -1341,3 +1342,44 @@ func DelteGroup(group, depart, dime, target, suntar, detaid int64, runDepart str
}
}
}
// TextShiyan is a text-comparison experiment built on the textcomparison
// package. It runs two hard-coded sample passages through RemoveHtml, asks
// the segmenter for weighted keywords (ExtractWithWeight with an argument
// of 30), converts them to textcomparison.WordWeight values and prints the
// distance reported by SimHashSimilar.
//
// All output goes to stdout via fmt.Printf.
// NOTE(review): nothing is written to the gin response through c — confirm
// that is intended for this experimental endpoint.
func (s *ShiyanApi) TextShiyan(c *gin.Context) {
	participle := textcomparison.NewGoJieba()
	// Strip HTML tags from the sample texts.
	srcStr := textcomparison.RemoveHtml("关于区块链和数字货币的关系,很多人或多或少都存在疑惑。简单来说,区块链是比特币的底层运用,而比特币只是区块链的一个小应用而已。" +
		"数字货币即虚拟货币,最早的数字货币诞生于2009年,其发明者中本聪为了应对经济危机对于实体货币经济的冲击。比特币是最早的数字货币,后来出现了以太币、火币以及莱特币等虚拟货币,这些虚拟货币是不能用来交易的。" +
		"狭义来讲,区块链是一种按照时间顺序将数据区块以顺序相连的方式组合成的一种链式数据结构, 并以密码学方式保证的不可篡改和不可伪造的分布式账本。" +
		"广义来讲,区块链技术是利用块链式数据结构来验证与存储数据、利用分布式节点共识算法来生成和更新数据、利用密码学的方式保证数据传输和访问的安全、利用由自动化脚本代码组成的智能合约来编程和操作数据的一种全新的分布式基础架构与计算方式。")
	dstStr := textcomparison.RemoveHtml("区块链技术为我们的信息防伪与数据追踪提供了革新手段。区块链中的数据区块顺序相连构成了一个不可篡改的数据链条,时间戳为所有的交易行为贴上了一套不讲课伪造的真是标签,这对于人们在现实生活中打击假冒伪劣产品大有裨益; " +
		"市场分析指出,整体而言,区块链技术目前在十大金融领域显示出应用前景,分别是资产证券化、保险、供应链金融、场外市场、资产托管、大宗商品交易、风险信息共享机制、贸易融资、银团贷款、股权交易交割。" +
		"这些金融场景有三大共性:参与节点多、验真成本高、交易流程长,而区块链的分布式记账、不可篡改、内置合约等特性可以为这些金融业务中的痛点提供解决方案。" +
		"传统的工业互联网模式是由一个中心化的机构收集和管理所有的数据信息,容易产生因设备生命周期和安全等方面的缺陷引起的数据丢失、篡改等问题。区块链技术可以在无需任何信任单个节点的同时构建整个网络的信任共识,从而很好的解决目前工业互联网技术领域的一些缺陷,让物与物之间能够实现更好的连接.")

	srcWordsWeight := participle.C.ExtractWithWeight(srcStr, 30)
	dstWordsWeight := participle.C.ExtractWithWeight(dstStr, 30)
	fmt.Printf("srcWordsWeight: %v\n", srcWordsWeight)
	fmt.Printf("dstWordsWeight: %v\n", dstWordsWeight)

	// Copy the segmenter's results into the package's own WordWeight type.
	srcWords := make([]textcomparison.WordWeight, len(srcWordsWeight))
	dstWords := make([]textcomparison.WordWeight, len(dstWordsWeight))
	for i, ww := range srcWordsWeight {
		srcWords[i] = textcomparison.WordWeight{Word: ww.Word, Weight: ww.Weight}
	}
	for i, ww := range dstWordsWeight {
		dstWords[i] = textcomparison.WordWeight{Word: ww.Word, Weight: ww.Weight}
	}
	fmt.Printf("srcWords:%v\n", srcWords)
	fmt.Printf("dstWords:%v\n", dstWords)

	distance, err := textcomparison.SimHashSimilar(srcWords, dstWords)
	if err != nil {
		// Report the error SimHashSimilar returned (previously c.Errors was
		// printed, which is unrelated to this call) and stop: the distance
		// is not meaningful when err is non-nil.
		fmt.Printf("failed:%v\n", err)
		return
	}
	fmt.Printf("SimHashSimilar distance:%v\n", distance)
}

1
gin_server_admin/go.mod

@ -35,6 +35,7 @@ require (
github.com/unrolled/render v1.4.0
github.com/unrolled/secure v1.0.7
github.com/xuri/excelize/v2 v2.4.1
github.com/yanyiwu/gojieba v1.1.2
go.uber.org/zap v1.16.0
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect

2
gin_server_admin/go.sum

@ -503,6 +503,8 @@ github.com/xuri/efp v0.0.0-20210322160811-ab561f5b45e3 h1:EpI0bqf/eX9SdZDwlMmahK
github.com/xuri/efp v0.0.0-20210322160811-ab561f5b45e3/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI=
github.com/xuri/excelize/v2 v2.4.1 h1:veeeFLAJwsNEBPBlDepzPIYS1eLyBVcXNZUW79exZ1E=
github.com/xuri/excelize/v2 v2.4.1/go.mod h1:rSu0C3papjzxQA3sdK8cU544TebhrPUoTOaGPIh0Q1A=
github.com/yanyiwu/gojieba v1.1.2 h1:BMwKCwg3G+Nw/Ctqzm/gNgN/6Ljf0Y4f7ddKlzTA+TM=
github.com/yanyiwu/gojieba v1.1.2/go.mod h1:54wkP7sMJ6bklf7yPl6F+JG71dzVUU1WigZbR47nGdY=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q=

2
gin_server_admin/router/shiyan/sys_shiyan.go

@ -30,5 +30,7 @@ func (s *ShiyanRouter) InitShiyanRouter(Router *gin.RouterGroup) {
shiyanCodeRouter.POST("/datetime", authorityApi.DateTime) //测试时间
shiyanCodeRouter.GET("deltdutychongfu", authorityApi.DeltDutyChongfu) //方案去重
shiyanCodeRouter.POST("textshiyan", authorityApi.TextShiyan) //比对文本
}
}

Loading…
Cancel
Save