Skip to content

Commit

Permalink
chore: small refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 10, 2024
1 parent 21847ce commit 59c402f
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 17 deletions.
19 changes: 2 additions & 17 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,29 +342,14 @@ func (c *Crawl) Capture(item *queue.Item) error {

// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
// TODO: support other watch page URLs
if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP {
if !c.NoYTDLP && youtube.IsYouTubeWatchPage(item.URL) {
URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
return err
}
resp.Body.Close()

// Build the cookies
// cookies := append([]*http.Cookie{}, &http.Cookie{
// Name: "Accept",
// Value: HTTPHeaders.Accept,
// }, &http.Cookie{
// Name: "Accept-Language",
// Value: HTTPHeaders.AcceptLanguage,
// }, &http.Cookie{
// Name: "Sec-Fetch-Mode",
// Value: HTTPHeaders.SecFetchMode,
// }, &http.Cookie{
// Name: "User-Agent",
// Value: HTTPHeaders.UserAgent,
// })

var headers = make(map[string]string)
headers["Accept"] = HTTPHeaders.Accept
headers["Accept-Language"] = HTTPHeaders.AcceptLanguage
Expand All @@ -377,7 +362,7 @@ func (c *Crawl) Capture(item *queue.Item) error {

// Write the metadata record for the video
if rawJSON != "" {
c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dl", rawJSON)
c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON)
}

return nil
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/crawl/sitespecific/youtube/youtube.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@ package youtube
import (
"io"
"net/url"
"strings"

"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

// IsYouTubeWatchPage reports whether URL looks like a YouTube video page.
//
// The host must contain "youtube.com" (compared case-insensitively, since
// hostnames are case-insensitive and url.Parse does not normalize them) and
// the path must contain either "/watch" or "/v/". Both checks are substring
// matches, so subdomains such as "www." or "m." are accepted.
//
// A nil URL is never considered a watch page.
func IsYouTubeWatchPage(URL *url.URL) bool {
	// Guard against a nil URL so the field accesses below cannot panic.
	if URL == nil {
		return false
	}

	// Lowercase the host before matching: "WWW.YouTube.com" is the same host
	// as "www.youtube.com". The path is left as-is because paths are
	// case-sensitive.
	if !strings.Contains(strings.ToLower(URL.Host), "youtube.com") {
		return false
	}

	return strings.Contains(URL.Path, "/watch") || strings.Contains(URL.Path, "/v/")
}

func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders ytdlp.HTTPHeaders, err error) {
// Create a temporary server to serve the body and call ytdlp on it
port, stopChan, err := ytdlp.ServeBody(body)
Expand Down

0 comments on commit 59c402f

Please sign in to comment.