Skip to content

Commit

Permalink
Fix more cases of pages redirecting to themselves
Browse files Browse the repository at this point in the history
This was "fixed" in b4ca6a7 (#763), but the fix turned out to be incomplete.

That fix only allowed redirects leading to the same URL as the original
destination, and didn't take into account more complicated cases. Such
as, for example:

 * www.example.com
 * example.com
 * (set cookie)
 * example.com
  • Loading branch information
WGH- committed Mar 29, 2024
1 parent c8b9cba commit a8b1e36
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 5 deletions.
24 changes: 19 additions & 5 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1343,12 +1343,26 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
return fmt.Errorf("Not following redirect to %q: %w", req.URL, err)
}

// allow redirects to the original destination
// to support websites redirecting to the same page while setting
// session cookies
samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String())
// Page may set cookies and respond with a redirect to itself.
// Some example of such redirect "cycles":
//
// example.com -(set cookie)-> example.com
// example.com -> auth.example.com -(set cookie)-> example.com
// www.example.com -> example.com -(set cookie)-> example.com
//
// We must not return "already visited" error in such cases.
// So ignore redirect cycles when checking for URL revisit.
redirectCycle := false
normalizedURL := normalizeURL(req.URL.String())
for _, viaReq := range via {
viaUrl := normalizeURL(viaReq.URL.String())
if viaUrl == normalizedURL {
redirectCycle = true
break
}
}

if !c.AllowURLRevisit && !samePageRedirect {
if !c.AllowURLRevisit && !redirectCycle {
var body io.ReadCloser
if req.GetBody != nil {
var err error
Expand Down
24 changes: 24 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,30 @@ func TestSetCookieRedirect(t *testing.T) {
}
}

func TestSetCookieComplexRedirectCycle(t *testing.T) {
// server2 -> server1 -(set cookie)-> server1
ts1 := newUnstartedTestServer()
ts1.Config.Handler = requireSessionCookieSimple(ts1.Config.Handler)
ts1.Start()
defer ts1.Close()

ts2 := httptest.NewServer(http.RedirectHandler(ts1.URL, http.StatusMovedPermanently))
defer ts2.Close()

c := NewCollector()
c.OnResponse(func(r *Response) {
if got, want := r.Body, serverIndexResponse; !bytes.Equal(got, want) {
t.Errorf("bad response body got=%q want=%q", got, want)
}
if got, want := r.StatusCode, http.StatusOK; got != want {
t.Errorf("bad response code got=%d want=%d", got, want)
}
})
if err := c.Visit(ts2.URL); err != nil {
t.Fatal(err)
}
}

func TestCollectorPostURLRevisitCheck(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit a8b1e36

Please sign in to comment.