-
Notifications
You must be signed in to change notification settings - Fork 1.8k
/
htmlelement.go
131 lines (118 loc) · 4.09 KB
/
htmlelement.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"strings"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// HTMLElement is the representation of a HTML tag.
type HTMLElement struct {
// Name is the name of the tag
Name string
Text string
attributes []html.Attribute
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the goquery parsed DOM object of the page. DOM is relative
// to the current HTMLElement
DOM *goquery.Selection
// Index stores the position of the current element within all the elements matched by an OnHTML callback
Index int
}
// NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement {
return &HTMLElement{
Name: n.Data,
Request: resp.Request,
Response: resp,
Text: goquery.NewDocumentFromNode(n).Text(),
DOM: s,
Index: idx,
attributes: n.Attr,
}
}
// Attr returns the selected attribute of a HTMLElement or empty string
// if no attribute found
func (h *HTMLElement) Attr(k string) string {
for _, a := range h.attributes {
if a.Key == k {
return a.Val
}
}
return ""
}
// ChildText returns the concatenated and stripped text content of the matching
// elements.
func (h *HTMLElement) ChildText(goquerySelector string) string {
return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
}
// ChildTexts returns the stripped text content of all the matching
// elements.
func (h *HTMLElement) ChildTexts(goquerySelector string) []string {
var res []string
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
res = append(res, strings.TrimSpace(s.Text()))
})
return res
}
// ChildAttr returns the stripped text content of the first matching
// element's attribute.
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
if attr, ok := h.DOM.Find(goquerySelector).Attr(attrName); ok {
return strings.TrimSpace(attr)
}
return ""
}
// ChildAttrs returns the stripped text content of all the matching
// element's attributes.
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string {
var res []string
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
if attr, ok := s.Attr(attrName); ok {
res = append(res, strings.TrimSpace(attr))
}
})
return res
}
// ForEach iterates over the elements matched by the first argument
// and calls the callback function on every HTMLElement match.
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) {
i := 0
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
for _, n := range s.Nodes {
callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i))
i++
}
})
}
// ForEachWithBreak iterates over the elements matched by the first argument
// and calls the callback function on every HTMLElement match.
// It is identical to ForEach except that it is possible to break
// out of the loop by returning false in the callback function. It returns the
// current Selection object.
func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) {
i := 0
h.DOM.Find(goquerySelector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
for _, n := range s.Nodes {
if callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i)) {
i++
return true
}
}
return false
})
}