From 03ec807bdba050468b2e6b875c190c09fee8a122 Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Mon, 9 Sep 2024 15:59:20 +0200
Subject: [PATCH] fix: remove default global HTTP timeout

---
 cmd/get.go                                    |  2 +-
 go.mod                                        |  4 +-
 go.sum                                        |  4 +-
 internal/pkg/crawl/assets.go                  | 41 +++++++++++--------
 internal/pkg/crawl/capture.go                 | 28 +++++++++++--
 internal/pkg/crawl/crawl.go                   |  8 +++-
 .../pkg/crawl/dependencies/ytdlp/model.go     | 18 ++++----
 .../pkg/crawl/dependencies/ytdlp/ytdlp.go     | 31 +++++++++-----
 .../pkg/crawl/sitespecific/youtube/youtube.go | 12 +++---
 9 files changed, 97 insertions(+), 51 deletions(-)

diff --git a/cmd/get.go b/cmd/get.go
index 794ac12..2b73b6c 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
 	getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
 	getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
 	getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
-	getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
+	getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request. -1 disables the global timeout.")
 	getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
 	getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
 	getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
diff --git a/go.mod b/go.mod
index 33eec97..1e24390 100644
--- a/go.mod
+++ b/go.mod
@@ -4,7 +4,7 @@ go 1.23.1
 
 require (
 	git.archive.org/wb/gocrawlhq v1.2.7
-	github.com/CorentinB/warc v0.8.44
+	github.com/CorentinB/warc v0.8.45
 	github.com/PuerkitoBio/goquery v1.9.2
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
 	github.com/clbanning/mxj/v2 v2.7.0
@@ -14,6 +14,7 @@ require (
 	github.com/gosuri/uilive v0.0.4
 	github.com/gosuri/uitable v0.0.4
 	github.com/grafana/pyroscope-go v1.1.2
+	github.com/grafov/m3u8 v0.12.0
 	github.com/paulbellamy/ratecounter v0.2.0
 	github.com/philippgille/gokv/leveldb v0.7.0
 	github.com/prometheus/client_golang v1.20.2
@@ -49,7 +50,6 @@ require (
 	github.com/golang/snappy v0.0.4 // indirect
 	github.com/gomodule/redigo v1.9.2 // indirect
 	github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect
-	github.com/grafov/m3u8 v0.12.0 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
diff --git a/go.sum b/go.sum
index 675ea40..952c69d 100644
--- a/go.sum
+++ b/go.sum
@@ -1,7 +1,7 @@
 git.archive.org/wb/gocrawlhq v1.2.7 h1:+LGu6hcG4xpyHFvmk3TCTEmU90wwWj1RW9PPqWVx9TQ=
 git.archive.org/wb/gocrawlhq v1.2.7/go.mod h1:ursn4DkepW9Z6kKMp5qfeZc2+75gcSBmFgoIWGt2sWA=
-github.com/CorentinB/warc v0.8.44 h1:dxtImoHbCDQh84yp6XSnHiBP/MGQypJNw9Hovg2zA+Y=
-github.com/CorentinB/warc v0.8.44/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU=
+github.com/CorentinB/warc v0.8.45 h1:AqhjgyLyvF2FKj4iI0nAaLGNmoS9wMEVFw4I3Uf9qr8=
+github.com/CorentinB/warc v0.8.45/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU=
 github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
 github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
 github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go
index 985ae78..faac6a7 100644
--- a/internal/pkg/crawl/assets.go
+++ b/internal/pkg/crawl/assets.go
@@ -19,7 +19,7 @@ import (
 var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
 var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)
 
-func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error {
+func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
 	var resp *http.Response
 
 	// Prepare GET request
@@ -28,8 +28,16 @@
 		return err
 	}
 
-	req.Header.Set("Referer", utils.URLToString(item.ParentURL))
-	req.Header.Set("User-Agent", c.UserAgent)
+	// If headers are passed, apply them to the request;
+	// otherwise, apply the default headers
+	if headers != nil {
+		for key, value := range headers {
+			req.Header.Set(key, value)
+		}
+	} else {
+		req.Header.Set("Referer", utils.URLToString(item.ParentURL))
+		req.Header.Set("User-Agent", c.UserAgent)
+	}
 
 	// Apply cookies obtained from the original URL captured
 	for i := range cookies {
@@ -45,23 +53,24 @@
 	defer resp.Body.Close()
 
 	if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") {
-		assets, err := extractor.M3U8(resp)
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
-		}
-
-		c.captureAssets(item, assets, cookies)
-
-		return nil
+		// assets, err := extractor.M3U8(resp)
+		// if err != nil {
+		// 	c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
+		// }
+		// resp.Body.Close()
+
+		// c.captureAssets(item, assets, cookies)
+
+		// return nil
+	} else {
+		// needed for WARC writing
+		io.Copy(io.Discard, resp.Body)
 	}
 
-	// needed for WARC writing
-	io.Copy(io.Discard, resp.Body)
-
 	return nil
 }
 
-func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) {
+func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
 	// TODO: implement a counter for the number of assets
 	// currently being processed
 	// c.Frontier.QueueCount.Incr(int64(len(assets)))
@@ -109,7 +118,7 @@
 	}
 
 	// Capture the asset
-	err = c.captureAsset(newAsset, cookies)
+	err = c.captureAsset(newAsset, cookies, headers)
 	if err != nil {
 		c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
 			"parentHop": item.Hop,
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index 69e000b..b9d744b 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -343,14 +343,36 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
 	// TODO: support other watch page URLs
 	if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP {
-		URLs, rawJSON, err := youtube.Parse(resp.Body)
+		URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body)
 		if err != nil {
 			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
 			return err
 		}
+		resp.Body.Close()
+
+		// Build the cookies
+		// cookies := append([]*http.Cookie{}, &http.Cookie{
+		// 	Name:  "Accept",
+		// 	Value: HTTPHeaders.Accept,
+		// }, &http.Cookie{
+		// 	Name:  "Accept-Language",
+		// 	Value: HTTPHeaders.AcceptLanguage,
+		// }, &http.Cookie{
+		// 	Name:  "Sec-Fetch-Mode",
+		// 	Value: HTTPHeaders.SecFetchMode,
+		// }, &http.Cookie{
+		// 	Name:  "User-Agent",
+		// 	Value: HTTPHeaders.UserAgent,
+		// })
+
+		var headers = make(map[string]string)
+		headers["Accept"] = HTTPHeaders.Accept
+		headers["Accept-Language"] = HTTPHeaders.AcceptLanguage
+		headers["Sec-Fetch-Mode"] = HTTPHeaders.SecFetchMode
+		headers["User-Agent"] = HTTPHeaders.UserAgent
 
 		if len(URLs) > 0 {
-			c.captureAssets(item, URLs, resp.Cookies())
+			c.captureAssets(item, URLs, resp.Cookies(), headers)
 		}
 
 		// Write the metadata record for the video
@@ -545,7 +567,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
 		}
 	}
 
-	c.captureAssets(item, assets, resp.Cookies())
+	c.captureAssets(item, assets, resp.Cookies(), nil)
 
 	return err
 }
diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go
index 3d41f1f..2420db8 100644
--- a/internal/pkg/crawl/crawl.go
+++ b/internal/pkg/crawl/crawl.go
@@ -121,8 +121,12 @@ func (c *Crawl) Start() (err error) {
 		}
 	}()
 
-	c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second
-	c.Log.Info("HTTP client timeout set", "timeout", c.HTTPTimeout)
+	if c.HTTPTimeout > 0 {
+		c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second
+		c.Log.Info("Global HTTP client timeout set", "timeout", c.HTTPTimeout)
+	} else {
+		c.Log.Info("Global HTTP client timeout not set")
+	}
 
 	if c.Proxy != "" {
 		proxyHTTPClientSettings := HTTPClientSettings
diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go
index bec84aa..5e6b507 100644
--- a/internal/pkg/crawl/dependencies/ytdlp/model.go
+++ b/internal/pkg/crawl/dependencies/ytdlp/model.go
@@ -20,14 +20,9 @@ type Video struct {
 		Fragments []struct {
 			URL string `json:"url"`
 		} `json:"fragments"`
-		HasDrm      bool    `json:"has_drm"`
-		Height      float64 `json:"height"`
-		HTTPHeaders struct {
-			Accept         string `json:"Accept"`
-			AcceptLanguage string `json:"Accept-Language"`
-			SecFetchMode   string `json:"Sec-Fetch-Mode"`
-			UserAgent      string `json:"User-Agent"`
-		} `json:"http_headers"`
+		HasDrm      bool        `json:"has_drm"`
+		Height      float64     `json:"height"`
+		HTTPHeaders HTTPHeaders `json:"http_headers"`
 		Language           interface{} `json:"language"`
 		LanguagePreference float64     `json:"language_preference"`
 		Preference         interface{} `json:"preference"`
@@ -91,3 +86,10 @@ type Video struct {
 		URL string `json:"url"`
 	} `json:"thumbnails"`
 }
+
+type HTTPHeaders struct {
+	Accept         string `json:"Accept"`
+	AcceptLanguage string `json:"Accept-Language"`
+	SecFetchMode   string `json:"Sec-Fetch-Mode"`
+	UserAgent      string `json:"User-Agent"`
+}
diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
index 2ae31ef..23dad79 100644
--- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
+++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
@@ -8,7 +8,7 @@ import (
 	"strconv"
 )
 
-func GetJSON(port int) (URLs []string, rawJSON string, err error) {
+func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) {
 	// Prepare the command
 	cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port))
 
@@ -20,22 +20,22 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) {
 	// Run the command
 	err = cmd.Run()
 	if err != nil {
-		return URLs, rawJSON, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
+		return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
 	}
 
 	output := stdout.String()
 
 	// Find subtitles
-	// subtitleURLs, err := parseSubtitles(output)
-	// if err != nil {
-	// 	return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err)
-	// }
+	subtitleURLs, err := parseSubtitles(output)
+	if err != nil {
+		return nil, rawJSON, HTTPHeaders, fmt.Errorf("error parsing subtitles: %v", err)
+	}
 
 	// Parse the output as a Video object
 	var video Video
 	err = json.Unmarshal([]byte(output), &video)
 	if err != nil {
-		return nil, rawJSON, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
+		return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
 	}
 
 	// Get all thumbnail URLs
@@ -46,14 +46,23 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) {
 	// Get the manifest URL for the best video & audio quality
 	// Note: we do not archive live streams
 	if !video.IsLive {
-		for _, format := range video.RequestedFormats {
-			URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID)
+		if len(video.RequestedFormats) > 0 {
+			HTTPHeaders = video.RequestedFormats[0].HTTPHeaders
+			for _, format := range video.RequestedFormats {
+				URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID)
+			}
 		}
 	}
 
-	//URLs = append(URLs, subtitleURLs...)
+	// write output to a .json file (debug)
+	// err = ioutil.WriteFile("output.json", []byte(output), 0644)
+	// if err != nil {
+	// 	return nil, rawJSON, HTTPHeaders, fmt.Errorf("error writing output.json: %v", err)
+	// }
+
+	URLs = append(URLs, subtitleURLs...)
 
-	return URLs, output, nil
+	return URLs, output, HTTPHeaders, nil
 }
 
 func FindPath() (string, bool) {
diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go
index 6424000..643ae05 100644
--- a/internal/pkg/crawl/sitespecific/youtube/youtube.go
+++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go
@@ -7,29 +7,29 @@ import (
 	"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
 )
 
-func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, err error) {
+func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders ytdlp.HTTPHeaders, err error) {
 	// Create a temporary server to serve the body and call ytdlp on it
 	port, stopChan, err := ytdlp.ServeBody(body)
 	if err != nil {
-		return nil, rawJSON, err
+		return nil, rawJSON, HTTPHeaders, err
 	}
 	defer close(stopChan)
 
 	// Call ytdlp on the temporary server
-	rawURLs, rawJSON, err := ytdlp.GetJSON(port)
+	rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port)
 	if err != nil {
-		return nil, rawJSON, err
+		return nil, rawJSON, HTTPHeaders, err
 	}
 
 	// Parse the URLs
 	for _, urlString := range rawURLs {
 		URL, err := url.Parse(urlString)
 		if err != nil {
-			return nil, rawJSON, err
+			return nil, rawJSON, HTTPHeaders, err
 		}
 
 		URLs = append(URLs, URL)
 	}
 
-	return URLs, rawJSON, nil
+	return URLs, rawJSON, HTTPHeaders, nil
 }
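
Note on the timeout change: with the default moved from 30 to -1, Start() no longer touches http.Client.Timeout, which leaves it at its zero value, i.e. no overall per-request deadline. The sketch below illustrates that behavior; the configureClient helper and the bare *http.Client are illustrative assumptions, not Zeno's actual client wiring.

package main

import (
	"fmt"
	"net/http"
	"time"
)

// configureClient mirrors the new guard in crawl.Start(): only a positive
// --http-timeout value installs a global deadline on the client.
func configureClient(client *http.Client, httpTimeout int) {
	if httpTimeout > 0 {
		client.Timeout = time.Duration(httpTimeout) * time.Second
		return
	}
	// Left at the zero value, client.Timeout imposes no deadline at all;
	// slow-read protection must then come from the transport (e.g.
	// http.Transport.ResponseHeaderTimeout) or per-request contexts.
}

func main() {
	client := &http.Client{}
	configureClient(client, -1)
	fmt.Println("global timeout:", client.Timeout) // 0s, i.e. disabled
}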
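Note on header propagation: captureAsset now prefers caller-supplied headers over the crawler defaults, which is what lets the http_headers reported by yt-dlp travel from youtube.Parse down to the asset requests. The sketch below isolates that precedence rule; newAssetRequest is a hypothetical stand-in for the request preparation inside captureAsset, and this HTTPHeaders mirrors the struct added in ytdlp/model.go.

package main

import (
	"fmt"
	"net/http"
)

// HTTPHeaders mirrors internal/pkg/crawl/dependencies/ytdlp.HTTPHeaders.
type HTTPHeaders struct {
	Accept         string
	AcceptLanguage string
	SecFetchMode   string
	UserAgent      string
}

// newAssetRequest applies explicit headers when they are provided and falls
// back to the crawler defaults otherwise, like the branch in captureAsset.
func newAssetRequest(rawURL string, headers map[string]string, referer, userAgent string) (*http.Request, error) {
	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, err
	}
	if headers != nil {
		for key, value := range headers {
			req.Header.Set(key, value)
		}
	} else {
		req.Header.Set("Referer", referer)
		req.Header.Set("User-Agent", userAgent)
	}
	return req, nil
}

func main() {
	h := HTTPHeaders{Accept: "*/*", AcceptLanguage: "en-US", SecFetchMode: "navigate", UserAgent: "Mozilla/5.0"}

	// Same mapping Capture() builds before calling captureAssets.
	headers := map[string]string{
		"Accept":          h.Accept,
		"Accept-Language": h.AcceptLanguage,
		"Sec-Fetch-Mode":  h.SecFetchMode,
		"User-Agent":      h.UserAgent,
	}

	req, err := newAssetRequest("https://example.com/video.mp4", headers, "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Zeno")
	if err != nil {
		panic(err)
	}
	fmt.Println(req.Header.Get("User-Agent")) // Mozilla/5.0
}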