Skip to content

Commit

Permalink
Merge pull request #794 from Shinku-Chen/master
Browse files Browse the repository at this point in the history
Fix Bug: retry scrape will lost POST requestData
  • Loading branch information
WGH- authored Mar 25, 2024
2 parents 3c987f1 + 5ba23d8 commit 4ccfe78
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 1 deletion.
12 changes: 11 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ var (
ErrQueueFull = errors.New("Queue MaxSize reached")
// ErrMaxRequests is the error returned when exceeding max requests
ErrMaxRequests = errors.New("Max Requests limit reached")
// ErrRetryBodyUnseekable is the error returned when a retry is attempted with a non-seekable request body
ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable")
)

var envMap = map[string]func(*Collector, string){
Expand Down Expand Up @@ -629,6 +631,13 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
if _, ok := hdr["User-Agent"]; !ok {
hdr.Set("User-Agent", c.UserAgent)
}
if seeker, ok := requestData.(io.ReadSeeker); ok {
_, err := seeker.Seek(0, io.SeekStart)
if err != nil {
return err
}
}

req, err := http.NewRequest(method, parsedURL.String(), requestData)
if err != nil {
return err
Expand Down Expand Up @@ -1440,7 +1449,8 @@ func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
buffer.WriteString("\n")
}
buffer.WriteString(dashBoundary + "--\n\n")
return buffer
return bytes.NewReader(buffer.Bytes())

}

// randomBoundary was borrowed from
Expand Down
79 changes: 79 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1703,3 +1703,82 @@ func requireSessionCookieAuthPage(handler http.Handler) http.Handler {
handler.ServeHTTP(w, r)
})
}

// TestCollectorPostRetry issues a POST, retries it once from inside the
// OnResponse callback, and expects the body of the retried response to equal
// the posted value.
func TestCollectorPostRetry(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	postValue := "hello"
	c := NewCollector()
	try := false
	c.OnResponse(func(r *Response) {
		// The first response only triggers the retry; the context marker
		// tells the first response apart from the retried one.
		if r.Ctx.Get("notFirst") == "" {
			r.Ctx.Put("notFirst", "first")
			// Report a failed retry explicitly instead of discarding the
			// error, so the cause is visible in the test output.
			if err := r.Request.Retry(); err != nil {
				t.Errorf("Retry returned an unexpected error: %v", err)
			}
			return
		}
		if postValue != string(r.Body) {
			t.Error("Failed to send data with POST")
		}
		try = true
	})

	if err := c.Post(ts.URL+"/login", map[string]string{
		"name": postValue,
	}); err != nil {
		t.Errorf("Post returned an unexpected error: %v", err)
	}
	if !try {
		t.Error("OnResponse Retry was not called")
	}
}
// TestCollectorGetRetry visits the test server, retries the request once from
// inside the OnResponse callback, and expects the retried response body to
// equal serverIndexResponse.
func TestCollectorGetRetry(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()
	try := false

	c := NewCollector()

	c.OnResponse(func(r *Response) {
		// The first response only triggers the retry; the context marker
		// tells the first response apart from the retried one.
		if r.Ctx.Get("notFirst") == "" {
			r.Ctx.Put("notFirst", "first")
			// Report a failed retry explicitly instead of discarding the
			// error, so the cause is visible in the test output.
			if err := r.Request.Retry(); err != nil {
				t.Errorf("Retry returned an unexpected error: %v", err)
			}
			return
		}
		if !bytes.Equal(r.Body, serverIndexResponse) {
			t.Error("Response body does not match with the original content")
		}
		try = true
	})

	if err := c.Visit(ts.URL); err != nil {
		t.Errorf("Visit returned an unexpected error: %v", err)
	}
	if !try {
		t.Error("OnResponse Retry was not called")
	}
}

// TestCollectorPostRetryUnseekable sends a POST whose body is a *bytes.Buffer
// (a reader without a Seek method) and expects Retry to fail with
// ErrRetryBodyUnseekable, so the OnResponse callback must never run a second
// time.
func TestCollectorPostRetryUnseekable(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const postValue = "hello"
	retried := false
	collector := NewCollector()

	collector.OnResponse(func(resp *Response) {
		if got := string(resp.Body); got != postValue {
			t.Error("Failed to send data with POST")
		}

		// Any response after the first means the retry went through.
		if resp.Ctx.Get("notFirst") != "" {
			retried = true
			return
		}

		resp.Ctx.Put("notFirst", "first")
		if err := resp.Request.Retry(); !errors.Is(err, ErrRetryBodyUnseekable) {
			t.Errorf("Unexpected error Type ErrRetryBodyUnseekable : %v", err)
		}
	})

	body := bytes.NewBufferString("name=" + postValue)
	collector.Request("POST", srv.URL+"/login", body, nil, nil)
	if retried {
		t.Error("OnResponse Retry was called but BodyUnseekable")
	}
}
3 changes: 3 additions & 0 deletions request.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error
// Retry submits the HTTP request again with the same parameters.
//
// The "Cookie" header is removed from the request headers before the request
// is resubmitted. If the request has a non-nil body that does not implement
// io.ReadSeeker, nothing is sent and ErrRetryBodyUnseekable is returned.
func (r *Request) Retry() error {
	r.Headers.Del("Cookie")

	if r.Body != nil {
		if _, seekable := r.Body.(io.ReadSeeker); !seekable {
			return ErrRetryBodyUnseekable
		}
	}

	return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
}

Expand Down

0 comments on commit 4ccfe78

Please sign in to comment.