fix: remove default global HTTP timeout
CorentinB committed Sep 9, 2024
1 parent 6bdb2cf commit 03ec807
Showing 9 changed files with 97 additions and 51 deletions.
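Some context on why the default matters: in Go's net/http, Client.Timeout bounds the entire exchange (connection, redirects, and reading the whole response body), so any fixed global value will eventually cut off large but still-healthy downloads, which is fatal for an archiving crawler. The sketch below is illustrative only, not Zeno's actual client setup: it shows the general pattern of leaving Client.Timeout unset and bounding only the connection phases on the transport.

```go
package main

import (
	"net"
	"net/http"
	"time"
)

// newArchivalClient is a sketch, not Zeno's configuration: it leaves
// Client.Timeout unset so a slow-but-alive body read is never interrupted,
// and instead bounds only the connection phases on the transport.
func newArchivalClient() *http.Client {
	return &http.Client{
		// No Timeout field: reading a multi-gigabyte payload may
		// legitimately take longer than any fixed global value.
		Transport: &http.Transport{
			DialContext:           (&net.Dialer{Timeout: 10 * time.Second}).DialContext,
			TLSHandshakeTimeout:   10 * time.Second,
			ResponseHeaderTimeout: 30 * time.Second,
			IdleConnTimeout:       90 * time.Second,
		},
	}
}
```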
2 changes: 1 addition & 1 deletion cmd/get.go
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
-	getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
+	getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
4 changes: 2 additions & 2 deletions go.mod
@@ -4,7 +4,7 @@ go 1.23.1

require (
git.archive.org/wb/gocrawlhq v1.2.7
-	github.com/CorentinB/warc v0.8.44
+	github.com/CorentinB/warc v0.8.45
github.com/PuerkitoBio/goquery v1.9.2
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/clbanning/mxj/v2 v2.7.0
@@ -14,6 +14,7 @@ require (
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/grafana/pyroscope-go v1.1.2
+	github.com/grafov/m3u8 v0.12.0
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.20.2
@@ -49,7 +50,6 @@ require (
github.com/golang/snappy v0.0.4 // indirect
github.com/gomodule/redigo v1.9.2 // indirect
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect
-	github.com/grafov/m3u8 v0.12.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
4 changes: 2 additions & 2 deletions go.sum
@@ -1,7 +1,7 @@
git.archive.org/wb/gocrawlhq v1.2.7 h1:+LGu6hcG4xpyHFvmk3TCTEmU90wwWj1RW9PPqWVx9TQ=
git.archive.org/wb/gocrawlhq v1.2.7/go.mod h1:ursn4DkepW9Z6kKMp5qfeZc2+75gcSBmFgoIWGt2sWA=
-github.com/CorentinB/warc v0.8.44 h1:dxtImoHbCDQh84yp6XSnHiBP/MGQypJNw9Hovg2zA+Y=
-github.com/CorentinB/warc v0.8.44/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU=
+github.com/CorentinB/warc v0.8.45 h1:AqhjgyLyvF2FKj4iI0nAaLGNmoS9wMEVFw4I3Uf9qr8=
+github.com/CorentinB/warc v0.8.45/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
41 changes: 25 additions & 16 deletions internal/pkg/crawl/assets.go
@@ -19,7 +19,7 @@ import (
var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)

-func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error {
+func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
var resp *http.Response

// Prepare GET request
@@ -28,8 +28,16 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error {
return err
}

-	req.Header.Set("Referer", utils.URLToString(item.ParentURL))
-	req.Header.Set("User-Agent", c.UserAgent)
+	// If headers are passed, apply them to the request,
+	// else apply the default headers
+	if headers != nil {
+		for key, value := range headers {
+			req.Header.Set(key, value)
+		}
+	} else {
+		req.Header.Set("Referer", utils.URLToString(item.ParentURL))
+		req.Header.Set("User-Agent", c.UserAgent)
+	}

// Apply cookies obtained from the original URL captured
for i := range cookies {
@@ -45,23 +53,24 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error {
defer resp.Body.Close()

	if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") {
-		assets, err := extractor.M3U8(resp)
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
-		}
-
-		c.captureAssets(item, assets, cookies)
-
-		return nil
+		// assets, err := extractor.M3U8(resp)
+		// if err != nil {
+		// 	c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
+		// }
+		// resp.Body.Close()
+
+		// c.captureAssets(item, assets, cookies)
+
+		// return nil
+	} else {
+		// needed for WARC writing
+		io.Copy(io.Discard, resp.Body)
	}

-	// needed for WARC writing
-	io.Copy(io.Discard, resp.Body)
-
	return nil
}

-func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) {
+func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
// TODO: implement a counter for the number of assets
// currently being processed
// c.Frontier.QueueCount.Incr(int64(len(assets)))
@@ -109,7 +118,7 @@
}

// Capture the asset
-		err = c.captureAsset(newAsset, cookies)
+		err = c.captureAsset(newAsset, cookies, headers)
if err != nil {
c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
"parentHop": item.Hop,
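Taken on its own, the header-selection logic added above is: a caller-supplied map replaces the defaults wholesale, otherwise the crawler's Referer and User-Agent defaults apply. A standalone sketch of that pattern (the helper name and signature are hypothetical, not part of this commit):

```go
package main

import "net/http"

// applyHeaders mirrors the fallback pattern in captureAsset: caller-supplied
// headers replace the defaults wholesale; otherwise crawler defaults apply.
// Hypothetical helper for illustration only.
func applyHeaders(req *http.Request, headers map[string]string, referer, userAgent string) {
	if headers != nil {
		for key, value := range headers {
			req.Header.Set(key, value)
		}
	} else {
		req.Header.Set("Referer", referer)
		req.Header.Set("User-Agent", userAgent)
	}
}
```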
28 changes: 25 additions & 3 deletions internal/pkg/crawl/capture.go
@@ -343,14 +343,36 @@ func (c *Crawl) Capture(item *queue.Item) error {
// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
// TODO: support other watch page URLs
if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP {
-		URLs, rawJSON, err := youtube.Parse(resp.Body)
+		URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
return err
}
+		resp.Body.Close()

+		// Build the cookies
+		// cookies := append([]*http.Cookie{}, &http.Cookie{
+		// 	Name:  "Accept",
+		// 	Value: HTTPHeaders.Accept,
+		// }, &http.Cookie{
+		// 	Name:  "Accept-Language",
+		// 	Value: HTTPHeaders.AcceptLanguage,
+		// }, &http.Cookie{
+		// 	Name:  "Sec-Fetch-Mode",
+		// 	Value: HTTPHeaders.SecFetchMode,
+		// }, &http.Cookie{
+		// 	Name:  "User-Agent",
+		// 	Value: HTTPHeaders.UserAgent,
+		// })
+
+		var headers = make(map[string]string)
+		headers["Accept"] = HTTPHeaders.Accept
+		headers["Accept-Language"] = HTTPHeaders.AcceptLanguage
+		headers["Sec-Fetch-Mode"] = HTTPHeaders.SecFetchMode
+		headers["User-Agent"] = HTTPHeaders.UserAgent

if len(URLs) > 0 {
-			c.captureAssets(item, URLs, resp.Cookies())
+			c.captureAssets(item, URLs, resp.Cookies(), headers)
}

// Write the metadata record for the video
@@ -545,7 +567,7 @@
}
}

-	c.captureAssets(item, assets, resp.Cookies())
+	c.captureAssets(item, assets, resp.Cookies(), nil)

return err
}
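The four assignments above flatten yt-dlp's reported request headers into the map captureAssets now accepts; replaying those headers matters because the media URLs yt-dlp extracts may be served differently, or refused, under a different header set. The same construction expressed as a conversion helper, purely hypothetical (this commit builds the map inline instead):

```go
package youtube

import "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"

// headersToMap flattens ytdlp.HTTPHeaders into the map[string]string that
// captureAssets consumes. Hypothetical helper, shown for clarity only.
func headersToMap(h ytdlp.HTTPHeaders) map[string]string {
	return map[string]string{
		"Accept":          h.Accept,
		"Accept-Language": h.AcceptLanguage,
		"Sec-Fetch-Mode":  h.SecFetchMode,
		"User-Agent":      h.UserAgent,
	}
}
```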
8 changes: 6 additions & 2 deletions internal/pkg/crawl/crawl.go
@@ -121,8 +121,12 @@ func (c *Crawl) Start() (err error) {
}
}()

-	c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second
-	c.Log.Info("HTTP client timeout set", "timeout", c.HTTPTimeout)
+	if c.HTTPTimeout > 0 {
+		c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second
+		c.Log.Info("Global HTTP client timeout set", "timeout", c.HTTPTimeout)
+	} else {
+		c.Log.Info("Global HTTP client timeout not set")
+	}

if c.Proxy != "" {
proxyHTTPClientSettings := HTTPClientSettings
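With the global timeout now opt-in, a call site that genuinely needs a hard bound can attach its own deadline per request instead. A sketch of that pattern (assumed usage, not code from this commit; the 30-second value is a placeholder):

```go
package main

import (
	"context"
	"io"
	"net/http"
	"time"
)

// fetchWithDeadline bounds a single request with a context deadline rather
// than a global client timeout. Sketch only.
func fetchWithDeadline(client *http.Client, url string) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// The deadline covers the body read too: cancel only fires after this
	// function returns, so the read completes or times out first.
	return io.ReadAll(resp.Body)
}
```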
18 changes: 10 additions & 8 deletions internal/pkg/crawl/dependencies/ytdlp/model.go
@@ -20,14 +20,9 @@ type Video struct {
Fragments []struct {
URL string `json:"url"`
} `json:"fragments"`
-	HasDrm      bool    `json:"has_drm"`
-	Height      float64 `json:"height"`
-	HTTPHeaders struct {
-		Accept         string `json:"Accept"`
-		AcceptLanguage string `json:"Accept-Language"`
-		SecFetchMode   string `json:"Sec-Fetch-Mode"`
-		UserAgent      string `json:"User-Agent"`
-	} `json:"http_headers"`
+	HasDrm      bool        `json:"has_drm"`
+	Height      float64     `json:"height"`
+	HTTPHeaders HTTPHeaders `json:"http_headers"`
Language interface{} `json:"language"`
LanguagePreference float64 `json:"language_preference"`
Preference interface{} `json:"preference"`
@@ -91,3 +86,10 @@ type Video struct {
URL string `json:"url"`
} `json:"thumbnails"`
}

+type HTTPHeaders struct {
+	Accept         string `json:"Accept"`
+	AcceptLanguage string `json:"Accept-Language"`
+	SecFetchMode   string `json:"Sec-Fetch-Mode"`
+	UserAgent      string `json:"User-Agent"`
+}
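Promoting the anonymous struct to the named HTTPHeaders type is what allows GetJSON and youtube.Parse (below) to return it. A minimal decoding sketch, with invented sample values matching yt-dlp's "http_headers" shape:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type HTTPHeaders struct {
	Accept         string `json:"Accept"`
	AcceptLanguage string `json:"Accept-Language"`
	SecFetchMode   string `json:"Sec-Fetch-Mode"`
	UserAgent      string `json:"User-Agent"`
}

func main() {
	// Shape matches yt-dlp's "http_headers" object; values are invented.
	data := []byte(`{"Accept":"*/*","Accept-Language":"en-us,en;q=0.5","Sec-Fetch-Mode":"navigate","User-Agent":"Mozilla/5.0"}`)

	var h HTTPHeaders
	if err := json.Unmarshal(data, &h); err != nil {
		panic(err)
	}
	fmt.Println(h.UserAgent) // Mozilla/5.0
}
```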
31 changes: 20 additions & 11 deletions internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
@@ -8,7 +8,7 @@ import (
"strconv"
)

-func GetJSON(port int) (URLs []string, rawJSON string, err error) {
+func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) {
// Prepare the command
cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port))

@@ -20,22 +20,22 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) {
// Run the command
err = cmd.Run()
if err != nil {
-		return URLs, rawJSON, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
+		return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
}

output := stdout.String()

// Find subtitles
-	// subtitleURLs, err := parseSubtitles(output)
-	// if err != nil {
-	// 	return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err)
-	// }
+	subtitleURLs, err := parseSubtitles(output)
+	if err != nil {
+		return nil, rawJSON, HTTPHeaders, fmt.Errorf("error parsing subtitles: %v", err)
+	}

// Parse the output as a Video object
var video Video
err = json.Unmarshal([]byte(output), &video)
if err != nil {
-		return nil, rawJSON, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
+		return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
}

// Get all thumbnail URLs
@@ -46,14 +46,23 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) {
// Get the manifest URL for the best video & audio quality
// Note: we do not archive live streams
if !video.IsLive {
-		for _, format := range video.RequestedFormats {
-			URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID)
-		}
+		if len(video.RequestedFormats) > 0 {
+			HTTPHeaders = video.RequestedFormats[0].HTTPHeaders
+			for _, format := range video.RequestedFormats {
+				URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID)
+			}
+		}
}

-	//URLs = append(URLs, subtitleURLs...)
+	// write output to a .json file (debug)
+	// err = ioutil.WriteFile("output.json", []byte(output), 0644)
+	// if err != nil {
+	// 	return nil, rawJSON, HTTPHeaders, fmt.Errorf("error writing output.json: %v", err)
+	// }
+
+	URLs = append(URLs, subtitleURLs...)

-	return URLs, output, nil
+	return URLs, output, HTTPHeaders, nil
}

func FindPath() (string, bool) {
12 changes: 6 additions & 6 deletions internal/pkg/crawl/sitespecific/youtube/youtube.go
@@ -7,29 +7,29 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

-func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, err error) {
+func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders ytdlp.HTTPHeaders, err error) {
// Create a temporary server to serve the body and call ytdlp on it
port, stopChan, err := ytdlp.ServeBody(body)
if err != nil {
-		return nil, rawJSON, err
+		return nil, rawJSON, HTTPHeaders, err
}
defer close(stopChan)

// Call ytdlp on the temporary server
-	rawURLs, rawJSON, err := ytdlp.GetJSON(port)
+	rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port)
if err != nil {
-		return nil, rawJSON, err
+		return nil, rawJSON, HTTPHeaders, err
}

// Parse the URLs
for _, urlString := range rawURLs {
URL, err := url.Parse(urlString)
if err != nil {
-			return nil, rawJSON, err
+			return nil, rawJSON, HTTPHeaders, err
}

URLs = append(URLs, URL)
}

-	return URLs, rawJSON, nil
+	return URLs, rawJSON, HTTPHeaders, nil
}
