From bdb140e4632ab3717133aee866eb93fd6b048903 Mon Sep 17 00:00:00 2001 From: FanOne Date: Sun, 5 Nov 2023 02:29:37 +0000 Subject: [PATCH] refactor: extract the segment with weight module --- .gitignore | 1 + hmm/idf/tag_extracker.go | 46 ++++++---------------------------------- hmm/idf/textrank.go | 11 +++++----- hmm/segment/segment.go | 36 +++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 45 deletions(-) create mode 100644 hmm/segment/segment.go diff --git a/.gitignore b/.gitignore index 456bc14..5413be3 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,4 @@ _testmain.go .glide/ examples/dict/embed/embed examples/dict/embed/main +oryxBuildBinary \ No newline at end of file diff --git a/hmm/idf/tag_extracker.go b/hmm/idf/tag_extracker.go index e227849..4603b74 100644 --- a/hmm/idf/tag_extracker.go +++ b/hmm/idf/tag_extracker.go @@ -6,43 +6,9 @@ import ( "unicode/utf8" "github.com/go-ego/gse" + "github.com/go-ego/gse/hmm/segment" ) -// Segment type a word with weight. -type Segment struct { - text string - weight float64 -} - -// Text return the segment's text. -func (s Segment) Text() string { - return s.text -} - -// Weight return the segment's weight. -func (s Segment) Weight() float64 { - return s.weight -} - -// Segments type a slice of Segment. -type Segments []Segment - -func (ss Segments) Len() int { - return len(ss) -} - -func (ss Segments) Less(i, j int) bool { - if ss[i].weight == ss[j].weight { - return ss[i].text < ss[j].text - } - - return ss[i].weight < ss[j].weight -} - -func (ss Segments) Swap(i, j int) { - ss[i], ss[j] = ss[j], ss[i] -} - // TagExtracter is extract tags struct. type TagExtracter struct { seg gse.Segmenter @@ -82,7 +48,7 @@ func (t *TagExtracter) LoadStopWords(fileName ...string) error { } // ExtractTags extract the topK key words from text. 
-func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) { +func (t *TagExtracter) ExtractTags(text string, topK int) (tags segment.Segments) { freqMap := make(map[string]float64) for _, w := range t.seg.Cut(text, true) { @@ -110,13 +76,13 @@ func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) { freqMap[k] = v / total } - ws := make(Segments, 0) - var s Segment + ws := make(segment.Segments, 0) + var s segment.Segment for k, v := range freqMap { if freq, _, ok := t.Idf.Freq(k); ok { - s = Segment{text: k, weight: freq * v} + s = segment.Segment{Text: k, Weight: freq * v} } else { - s = Segment{text: k, weight: t.Idf.median * v} + s = segment.Segment{Text: k, Weight: t.Idf.median * v} } ws = append(ws, s) } diff --git a/hmm/idf/textrank.go b/hmm/idf/textrank.go index 81d5609..5fa64a0 100644 --- a/hmm/idf/textrank.go +++ b/hmm/idf/textrank.go @@ -6,6 +6,7 @@ import ( "github.com/go-ego/gse" "github.com/go-ego/gse/hmm/pos" + "github.com/go-ego/gse/hmm/segment" ) const dampingFactor = 0.85 @@ -81,7 +82,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) { edge{start: end, end: start, weight: weight}) } -func (u *undirectWeightedGraph) rank() Segments { +func (u *undirectWeightedGraph) rank() segment.Segments { if !sort.IsSorted(u.keys) { sort.Sort(u.keys) } @@ -124,10 +125,10 @@ func (u *undirectWeightedGraph) rank() Segments { } } - result := make(Segments, 0) + result := make(segment.Segments, 0) for n, w := range ws { result = append(result, - Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}, + segment.Segment{Text: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)}, ) } @@ -137,7 +138,7 @@ func (u *undirectWeightedGraph) rank() Segments { // TextRankWithPOS extracts keywords from text using TextRank algorithm. // Parameter allowPOS allows a []string pos list. 
-func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) Segments { +func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) segment.Segments { posFilt := make(map[string]int) for _, pos1 := range allowPOS { posFilt[pos1] = 1 @@ -181,6 +182,6 @@ func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) S // TextRank extract keywords from text using TextRank algorithm. // Parameter topK specify how many top keywords to be returned at most. -func (t *TextRanker) TextRank(text string, topK int) Segments { +func (t *TextRanker) TextRank(text string, topK int) segment.Segments { return t.TextRankWithPOS(text, topK, defaultAllowPOS) } diff --git a/hmm/segment/segment.go b/hmm/segment/segment.go new file mode 100644 index 0000000..34ffb58 --- /dev/null +++ b/hmm/segment/segment.go @@ -0,0 +1,36 @@ +package segment + +// Segment is a word with its weight. +type Segment struct { + Text string + Weight float64 +} + +// GetText returns the segment's text. +func (s Segment) GetText() string { + return s.Text +} + +// GetWeight returns the segment's weight. +func (s Segment) GetWeight() float64 { + return s.Weight +} + +// Segments is a slice of Segment. +type Segments []Segment + +func (ss Segments) Len() int { + return len(ss) +} + +func (ss Segments) Less(i, j int) bool { + if ss[i].Weight == ss[j].Weight { + return ss[i].Text < ss[j].Text + } + + return ss[i].Weight < ss[j].Weight +} + +func (ss Segments) Swap(i, j int) { + ss[i], ss[j] = ss[j], ss[i] +}