From b17ca8c59d844022837b2847ff9cadf2f5a51699 Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Mon, 1 Apr 2024 15:22:10 +0200
Subject: [PATCH] Add: --hq-rate-limiting-send-back

---
 cmd/all/all.go                            |  4 +-
 cmd/cmd.go                                |  7 +++-
 cmd/get/get.go                            |  6 +--
 cmd/get/hq.go                             |  6 +--
 cmd/get/list.go                           |  8 ++--
 cmd/get/url.go                            | 10 ++---
 cmd/utils.go                              |  9 +++--
 cmd/version/version.go                    |  4 +-
 config/config.go                          | 17 ++++----
 go.mod                                    |  2 +-
 internal/pkg/crawl/assets.go              |  6 +--
 internal/pkg/crawl/capture.go             | 39 ++++++++++++-------
 internal/pkg/crawl/crawl.go               | 29 +++++++-------
 internal/pkg/crawl/hq.go                  | 27 +++++++++++--
 internal/pkg/crawl/log.go                 |  4 +-
 internal/pkg/crawl/outlinks.go            |  8 ++--
 .../cloudflarestream/cloudflarestream.go  |  2 +-
 internal/pkg/crawl/utils.go               |  2 +-
 internal/pkg/crawl/warc.go                |  2 +-
 internal/pkg/crawl/worker.go              |  2 +-
 internal/pkg/frontier/frontier.go         |  2 +-
 internal/pkg/frontier/item.go             | 24 ++++++------
 internal/pkg/frontier/utils.go            |  2 +-
 internal/pkg/utils/utils.go               |  6 +--
 main.go                                   |  6 +--
 25 files changed, 138 insertions(+), 96 deletions(-)

diff --git a/cmd/all/all.go b/cmd/all/all.go
index 42358acb..67598d24 100644
--- a/cmd/all/all.go
+++ b/cmd/all/all.go
@@ -1,6 +1,6 @@
 package all
 
 import (
-	_ "github.com/CorentinB/Zeno/cmd/get"
-	_ "github.com/CorentinB/Zeno/cmd/version"
+	_ "github.com/internetarchive/Zeno/cmd/get"
+	_ "github.com/internetarchive/Zeno/cmd/version"
 )
diff --git a/cmd/cmd.go b/cmd/cmd.go
index ded21c30..b6547af9 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -6,7 +6,7 @@ import (
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 
-	"github.com/CorentinB/Zeno/config"
+	"github.com/internetarchive/Zeno/config"
 )
 
 var GlobalFlags = []cli.Flag{
@@ -288,6 +288,11 @@
 		Value:       "lifo",
 		Destination: &config.App.Flags.HQStrategy,
 	},
+	&cli.BoolFlag{
+		Name:        "hq-rate-limiting-send-back",
+		Usage:       "If turned on, the crawler will send back URLs that hit a rate limit to crawl HQ.",
+		Destination: &config.App.Flags.HQRateLimitingSendBack,
+	},
 	&cli.StringFlag{
 		Name:  "es-url",
 		Usage: "ElasticSearch URL to use for indexing crawl logs.",
diff --git a/cmd/get/get.go b/cmd/get/get.go
index ac01d6da..09750d75 100644
--- a/cmd/get/get.go
+++ b/cmd/get/get.go
@@ -1,13 +1,13 @@
 package get
 
 import (
-	"github.com/CorentinB/Zeno/cmd"
-	"github.com/CorentinB/Zeno/config"
+	"github.com/internetarchive/Zeno/cmd"
+	"github.com/internetarchive/Zeno/config"
 	log "github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 )
 
-func initLogging(c *cli.Context) (err error) {
+func initLogging() (err error) {
 	// Log as JSON instead of the default ASCII formatter.
 	if config.App.Flags.JSON {
 		log.SetFormatter(&log.JSONFormatter{})
diff --git a/cmd/get/hq.go b/cmd/get/hq.go
index 99850010..6c7dc208 100644
--- a/cmd/get/hq.go
+++ b/cmd/get/hq.go
@@ -1,8 +1,8 @@
 package get
 
 import (
-	"github.com/CorentinB/Zeno/cmd"
-	"github.com/CorentinB/Zeno/config"
+	"github.com/internetarchive/Zeno/cmd"
+	"github.com/internetarchive/Zeno/config"
 	"github.com/sirupsen/logrus"
 	log "github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
@@ -19,7 +19,7 @@ func newGetHQCmd() *cli.Command {
 }
 
 func cmdGetHQ(c *cli.Context) error {
-	err := initLogging(c)
+	err := initLogging()
 	if err != nil {
 		log.Error("Unable to parse arguments")
 		return err
diff --git a/cmd/get/list.go b/cmd/get/list.go
index 83518fff..bcacf64f 100644
--- a/cmd/get/list.go
+++ b/cmd/get/list.go
@@ -1,9 +1,9 @@
 package get
 
 import (
-	"github.com/CorentinB/Zeno/cmd"
-	"github.com/CorentinB/Zeno/config"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/cmd"
+	"github.com/internetarchive/Zeno/config"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
 	"github.com/sirupsen/logrus"
 	log "github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
@@ -20,7 +20,7 @@ func newGetListCmd() *cli.Command {
 }
 
 func cmdGetList(c *cli.Context) error {
-	err := initLogging(c)
+	err := initLogging()
 	if err != nil {
 		log.Error("Unable to parse arguments")
 		return err
diff --git a/cmd/get/url.go b/cmd/get/url.go
index 99ce557a..39749c59 100644
--- a/cmd/get/url.go
+++ b/cmd/get/url.go
@@ -3,9 +3,9 @@ package get
 import (
 	"net/url"
 
-	"github.com/CorentinB/Zeno/cmd"
-	"github.com/CorentinB/Zeno/config"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/cmd"
+	"github.com/internetarchive/Zeno/config"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 )
@@ -21,7 +21,7 @@ func newGetURLCmd() *cli.Command {
 }
 
 func cmdGetURL(c *cli.Context) error {
-	err := initLogging(c)
+	err := initLogging()
 	if err != nil {
 		logrus.Error("Unable to parse arguments")
 		return err
@@ -40,7 +40,7 @@ func cmdGetURL(c *cli.Context) error {
 		return err
 	}
 
-	crawl.SeedList = append(crawl.SeedList, *frontier.NewItem(input, nil, "seed", 0, ""))
+	crawl.SeedList = append(crawl.SeedList, *frontier.NewItem(input, nil, "seed", 0, "", false))
 
 	// Start crawl
 	err = crawl.Start()
diff --git a/cmd/utils.go b/cmd/utils.go
index 783bcfcd..440eedee 100644
--- a/cmd/utils.go
+++ b/cmd/utils.go
@@ -4,11 +4,11 @@ import (
 	"path"
 	"time"
 
-	"github.com/CorentinB/Zeno/config"
-	"github.com/CorentinB/Zeno/internal/pkg/crawl"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/google/uuid"
+	"github.com/internetarchive/Zeno/config"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/paulbellamy/ratecounter"
 	"github.com/remeh/sizedwaitgroup"
 	"github.com/sirupsen/logrus"
@@ -130,6 +130,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
 	c.HQStrategy = flags.HQStrategy
 	c.HQBatchSize = int(flags.HQBatchSize)
 	c.HQContinuousPull = flags.HQContinuousPull
+	c.HQRateLimitingSendBack = flags.HQRateLimitingSendBack
 
 	return c
 }
diff --git a/cmd/version/version.go b/cmd/version/version.go
index 59b9b1a6..7dd57c4a 100644
--- a/cmd/version/version.go
+++ b/cmd/version/version.go
@@ -1,8 +1,8 @@
 package version
 
 import (
-	"github.com/CorentinB/Zeno/cmd"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/cmd"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/urfave/cli/v2"
 )
 
diff --git a/config/config.go b/config/config.go
index 5f2f2601..1063f169 100644
--- a/config/config.go
+++ b/config/config.go
@@ -45,14 +45,15 @@ type Flags struct {
 	WARCTempDir      string
 	WARCCustomCookie string
 
-	UseHQ            bool
-	HQBatchSize      int64
-	HQAddress        string
-	HQProject        string
-	HQKey            string
-	HQSecret         string
-	HQStrategy       string
-	HQContinuousPull bool
+	UseHQ                  bool
+	HQBatchSize            int64
+	HQAddress              string
+	HQProject              string
+	HQKey                  string
+	HQSecret               string
+	HQStrategy             string
+	HQContinuousPull       bool
+	HQRateLimitingSendBack bool
 
 	CDXDedupeServer    string
 	DisableLocalDedupe bool
diff --git a/go.mod b/go.mod
index 007b5428..a83d7307 100644
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/CorentinB/Zeno
+module github.com/internetarchive/Zeno
 
 go 1.22
 
diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go
index abf6eac4..a6d36cdb 100644
--- a/internal/pkg/crawl/assets.go
+++ b/internal/pkg/crawl/assets.go
@@ -5,10 +5,10 @@ import (
 	"regexp"
 	"strings"
 
-	"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/PuerkitoBio/goquery"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
 func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.Document) (assets []*url.URL, err error) {
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index 828002fd..7cff8bf4 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -11,17 +11,17 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
-	"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/telegram"
-	"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/tiktok"
-	"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/vk"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/PuerkitoBio/goquery"
 	"github.com/clbanning/mxj/v2"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/telegram"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/tiktok"
+	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/vk"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/remeh/sizedwaitgroup"
 	"github.com/tomnomnom/linkheader"
 
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
 )
 
 func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection bool) (resp *http.Response, err error) {
@@ -98,22 +98,31 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection
 				"sleepTime":  sleepTime.String(),
 				"retryCount": retry,
 				"statusCode": resp.StatusCode,
-			})).Warn("we are being rate limited, sleeping then retrying..")
+			})).Debugf("we are being rate limited")
 
 			// This ensures we aren't leaving the warc dialer hanging.
 			// Do note, 429s are filtered out by WARC writer regardless.
 			io.Copy(io.Discard, resp.Body)
 			resp.Body.Close()
 
-			time.Sleep(sleepTime)
+			// If --hq-rate-limiting-send-back is enabled, we send the URL back to HQ
+			if c.UseHQ && c.HQRateLimitingSendBack {
+				return nil, errors.New("URL is being rate limited, sending back to HQ")
+			} else {
+				logWarning.WithFields(c.genLogFields(err, req.URL, map[string]interface{}{
+					"sleepTime":  sleepTime.String(),
+					"retryCount": retry,
+					"statusCode": resp.StatusCode,
+				})).Warn("URL is being rate limited")
+			}
+
 			continue
 		} else {
+			c.logCrawlSuccess(executionStart, resp.StatusCode, item)
 			break
 		}
 	}
 
-	c.logCrawlSuccess(executionStart, resp.StatusCode, item)
-
 	// If a redirection is catched, then we execute the redirection
 	if isStatusCodeRedirect(resp.StatusCode) {
 		if resp.Header.Get("location") == utils.URLToString(req.URL) || item.Redirect >= c.MaxRedirect {
@@ -153,7 +162,7 @@
 			}
 		}
 
-		newItem = frontier.NewItem(URL, item, item.Type, item.Hop, item.ID)
+		newItem = frontier.NewItem(URL, item, item.Type, item.Hop, item.ID, false)
 		newItem.Redirect = item.Redirect + 1
 
 		// Prepare GET request
@@ -239,7 +248,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
 		telegram.TransformURL(item.URL)
 
 		// Then we create an item
-		embedItem := frontier.NewItem(item.URL, item, item.Type, item.Hop, item.ID)
+		embedItem := frontier.NewItem(item.URL, item, item.Type, item.Hop, item.ID, false)
 
 		// And capture it
 		c.Capture(embedItem)
@@ -251,6 +260,10 @@ func (c *Crawl) Capture(item *frontier.Item) {
 	resp, err = c.executeGET(item, req, false)
 	if err != nil && err.Error() == "URL from redirection has already been seen" {
 		return
+	} else if err != nil && err.Error() == "URL is being rate limited, sending back to HQ" {
+		c.HQFinishedChannel <- item
+		c.HQProducerChannel <- frontier.NewItem(item.URL, item.ParentItem, item.Type, item.Hop, "", true)
+		return
 	} else if err != nil {
 		logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while executing GET request")
 		return
@@ -503,7 +516,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
 			defer swg.Done()
 
 			// Create the asset's item
-			newAsset := frontier.NewItem(asset, item, "asset", item.Hop, "")
+			newAsset := frontier.NewItem(asset, item, "asset", item.Hop, "", false)
 
 			// Capture the asset
 			err = c.captureAsset(newAsset, resp.Cookies())
diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go
index a20055d0..6b2d657c 100644
--- a/internal/pkg/crawl/crawl.go
+++ b/internal/pkg/crawl/crawl.go
@@ -6,9 +6,9 @@ import (
 	"time"
 
 	"git.archive.org/wb/gocrawlhq"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/CorentinB/warc"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/paulbellamy/ratecounter"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/remeh/sizedwaitgroup"
@@ -105,18 +105,19 @@ type Crawl struct {
 	WARCCustomCookie string
 
 	// Crawl HQ settings
-	UseHQ             bool
-	HQAddress         string
-	HQProject         string
-	HQKey             string
-	HQSecret          string
-	HQStrategy        string
-	HQBatchSize       int
-	HQContinuousPull  bool
-	HQClient          *gocrawlhq.Client
-	HQFinishedChannel chan *frontier.Item
-	HQProducerChannel chan *frontier.Item
-	HQChannelsWg      *sync.WaitGroup
+	UseHQ                  bool
+	HQAddress              string
+	HQProject              string
+	HQKey                  string
+	HQSecret               string
+	HQStrategy             string
+	HQBatchSize            int
+	HQContinuousPull       bool
+	HQClient               *gocrawlhq.Client
+	HQFinishedChannel      chan *frontier.Item
+	HQProducerChannel      chan *frontier.Item
+	HQChannelsWg           *sync.WaitGroup
+	HQRateLimitingSendBack bool
 }
 
 // Start fire up the crawling process
diff --git a/internal/pkg/crawl/hq.go b/internal/pkg/crawl/hq.go
index 123d5d89..5dbe6707 100644
--- a/internal/pkg/crawl/hq.go
+++ b/internal/pkg/crawl/hq.go
@@ -8,8 +8,8 @@ import (
 	"time"
 
 	"git.archive.org/wb/gocrawlhq"
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/sirupsen/logrus"
 )
 
@@ -104,15 +104,34 @@ func (c *Crawl) HQProducer() {
 
 	// listen to the discovered channel and add the URLs to the discoveredArray
 	for discoveredItem := range c.HQProducerChannel {
+		var via string
+
+		if discoveredItem.ParentItem != nil {
+			via = utils.URLToString(discoveredItem.ParentItem.URL)
+		}
+
 		discoveredURL := gocrawlhq.URL{
 			Value: utils.URLToString(discoveredItem.URL),
-			Via:   utils.URLToString(discoveredItem.ParentItem.URL),
+			Via:   via,
 		}
 
 		for i := 0; uint8(i) < discoveredItem.Hop; i++ {
 			discoveredURL.Path += "L"
 		}
 
+		if discoveredItem.BypassSeencheck {
+			for {
+				_, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "seed", true, false)
+				if err != nil {
+					logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..")
+					time.Sleep(time.Second)
+					continue
+				}
+				break
+			}
+			continue
+		}
+
 		mutex.Lock()
 		discoveredArray = append(discoveredArray, discoveredURL)
 		mutex.Unlock()
@@ -169,7 +188,7 @@ func (c *Crawl) HQConsumer() {
 				})).Errorln("unable to parse URL received from crawl HQ, discarding")
 			}
 
-			c.Frontier.PushChan <- frontier.NewItem(newURL, nil, "seed", uint8(strings.Count(URL.Path, "L")), URL.ID)
+			c.Frontier.PushChan <- frontier.NewItem(newURL, nil, "seed", uint8(strings.Count(URL.Path, "L")), URL.ID, false)
 		}
 	}
 }
diff --git a/internal/pkg/crawl/log.go b/internal/pkg/crawl/log.go
index 9c15c466..f0e74c2a 100644
--- a/internal/pkg/crawl/log.go
+++ b/internal/pkg/crawl/log.go
@@ -5,9 +5,9 @@ import (
 	"sync"
 	"time"
 
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/CorentinB/warc"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/sirupsen/logrus"
 )
 
diff --git a/internal/pkg/crawl/outlinks.go b/internal/pkg/crawl/outlinks.go
index 023ea5e7..f19de5da 100644
--- a/internal/pkg/crawl/outlinks.go
+++ b/internal/pkg/crawl/outlinks.go
@@ -6,9 +6,9 @@ import (
 	"strings"
 	"sync"
 
-	"github.com/CorentinB/Zeno/internal/pkg/frontier"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/PuerkitoBio/goquery"
+	"github.com/internetarchive/Zeno/internal/pkg/frontier"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
 func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) {
@@ -83,14 +83,14 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *frontier.Item, wg *sync
 		}
 
 		if c.DomainsCrawl && strings.Contains(item.Host, outlink.Host) && item.Hop == 0 {
-			newItem := frontier.NewItem(outlink, item, "seed", 0, "")
+			newItem := frontier.NewItem(outlink, item, "seed", 0, "", false)
 			if c.UseHQ {
 				c.HQProducerChannel <- newItem
 			} else {
 				c.Frontier.PushChan <- newItem
 			}
 		} else if c.MaxHops >= item.Hop+1 {
-			newItem := frontier.NewItem(outlink, item, "seed", item.Hop+1, "")
+			newItem := frontier.NewItem(outlink, item, "seed", item.Hop+1, "", false)
 			if c.UseHQ {
 				c.HQProducerChannel <- newItem
 			} else {
diff --git a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go b/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go
index 4bcdc31f..aa43f28c 100644
--- a/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go
+++ b/internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go
@@ -10,9 +10,9 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/CorentinB/warc"
 	"github.com/PuerkitoBio/goquery"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
 type MPD struct {
diff --git a/internal/pkg/crawl/utils.go b/internal/pkg/crawl/utils.go
index 5f463deb..83069209 100644
--- a/internal/pkg/crawl/utils.go
+++ b/internal/pkg/crawl/utils.go
@@ -6,7 +6,7 @@ import (
 	"strconv"
 	"time"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/sirupsen/logrus"
 	"github.com/zeebo/xxh3"
 )
diff --git a/internal/pkg/crawl/warc.go b/internal/pkg/crawl/warc.go
index b888a07c..29b1f6e0 100644
--- a/internal/pkg/crawl/warc.go
+++ b/internal/pkg/crawl/warc.go
@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"path"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/CorentinB/warc"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
 func (c *Crawl) initWARCRotatorSettings() *warc.RotatorSettings {
diff --git a/internal/pkg/crawl/worker.go b/internal/pkg/crawl/worker.go
index 4a62df58..918c2450 100644
--- a/internal/pkg/crawl/worker.go
+++ b/internal/pkg/crawl/worker.go
@@ -3,7 +3,7 @@ package crawl
 import (
 	"time"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
 const (
diff --git a/internal/pkg/frontier/frontier.go b/internal/pkg/frontier/frontier.go
index 4a421d5d..a9e73c02 100644
--- a/internal/pkg/frontier/frontier.go
+++ b/internal/pkg/frontier/frontier.go
@@ -4,8 +4,8 @@ import (
 	"path"
 	"sync"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
 	"github.com/beeker1121/goque"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/paulbellamy/ratecounter"
 	"github.com/philippgille/gokv/leveldb"
 	"github.com/sirupsen/logrus"
diff --git a/internal/pkg/frontier/item.go b/internal/pkg/frontier/item.go
index b0d312bd..cea90b1d 100644
--- a/internal/pkg/frontier/item.go
+++ b/internal/pkg/frontier/item.go
@@ -3,25 +3,26 @@ package frontier
 import (
 	"net/url"
 
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/zeebo/xxh3"
 )
 
 // Item is crawl-able object
 type Item struct {
-	ID             string
-	Hash           uint64
-	Hop            uint8
-	Host           string
-	Type           string
-	Redirect       int
-	URL            *url.URL
-	ParentItem     *Item
-	LocallyCrawled uint64
+	ID              string
+	Hash            uint64
+	Hop             uint8
+	Host            string
+	Type            string
+	Redirect        int
+	URL             *url.URL
+	ParentItem      *Item
+	LocallyCrawled  uint64
+	BypassSeencheck bool
 }
 
 // NewItem initialize an *Item
-func NewItem(URL *url.URL, parentItem *Item, itemType string, hop uint8, ID string) *Item {
+func NewItem(URL *url.URL, parentItem *Item, itemType string, hop uint8, ID string, bypassSeencheck bool) *Item {
 	item := new(Item)
 
 	item.URL = URL
@@ -33,6 +34,7 @@ func NewItem(URL *url.URL, parentItem *Item, itemType string, hop uint8, ID stri
 	item.ParentItem = parentItem
 	item.Hash = xxh3.HashString(utils.URLToString(URL))
 	item.Type = itemType
+	item.BypassSeencheck = bypassSeencheck
 
 	return item
 }
diff --git a/internal/pkg/frontier/utils.go b/internal/pkg/frontier/utils.go
index 183fefc2..64bd40f4 100644
--- a/internal/pkg/frontier/utils.go
+++ b/internal/pkg/frontier/utils.go
@@ -51,7 +51,7 @@ func IsSeedList(path string) (seeds []Item, err error) {
 			continue
 		}
 
-		item := NewItem(URL, nil, "seed", 0, "")
+		item := NewItem(URL, nil, "seed", 0, "", false)
 		seeds = append(seeds, *item)
 		validCount++
 		fmt.Fprintf(writer, "\t Reading input list.. Found %d valid URLs out of %d URLs read.\n", validCount, totalCount)
diff --git a/internal/pkg/utils/utils.go b/internal/pkg/utils/utils.go
index 10188901..3cf97ae9 100644
--- a/internal/pkg/utils/utils.go
+++ b/internal/pkg/utils/utils.go
@@ -28,9 +28,9 @@ func SetupLogging(jobPath string, liveStats bool, esURL string) (logInfo, logWar
 	logWarning = logrus.New()
 	logError = logrus.New()
 
-	logInfo.SetFormatter(&logrus.JSONFormatter{})
-	logWarning.SetFormatter(&logrus.JSONFormatter{})
-	logError.SetFormatter(&logrus.JSONFormatter{})
+	//logInfo.SetFormatter(&logrus.JSONFormatter{})
+	//logWarning.SetFormatter(&logrus.JSONFormatter{})
+	//logError.SetFormatter(&logrus.JSONFormatter{})
 
 	if esURL != "" {
 		client, err := elastic.NewClient(elastic.SetURL(esURL))
diff --git a/main.go b/main.go
index 444b69fc..e4a8e759 100644
--- a/main.go
+++ b/main.go
@@ -5,9 +5,9 @@ import (
 	_ "net/http/pprof"
 
-	"github.com/CorentinB/Zeno/cmd"
-	_ "github.com/CorentinB/Zeno/cmd/all"
-	"github.com/CorentinB/Zeno/internal/pkg/utils"
+	"github.com/internetarchive/Zeno/cmd"
+	_ "github.com/internetarchive/Zeno/cmd/all"
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 )
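
Reviewer note, not part of the patch: below is a minimal, hypothetical Go sketch of how the new send-back path fits together, assuming only the identifiers introduced above (the extra bypassSeencheck argument to frontier.NewItem and the BypassSeencheck item field). The standalone function and explicit channel parameters are simplifications of the c.HQFinishedChannel / c.HQProducerChannel fields used in (*Crawl).Capture.

package main

import (
	"net/url"

	"github.com/internetarchive/Zeno/internal/pkg/frontier"
)

// sendBackOnRateLimit mirrors the behaviour added in (*Crawl).Capture: when
// executeGET returns "URL is being rate limited, sending back to HQ", the item
// is pushed to the HQ finished channel and a copy with BypassSeencheck set is
// pushed to the HQ producer channel, so HQProducer's bypass branch (see hq.go
// above) re-announces the URL to crawl HQ instead of dropping it as already seen.
func sendBackOnRateLimit(item *frontier.Item, finished, producer chan<- *frontier.Item) {
	finished <- item
	producer <- frontier.NewItem(item.URL, item.ParentItem, item.Type, item.Hop, "", true)
}

func main() {
	u, _ := url.Parse("https://example.com/")
	finished := make(chan *frontier.Item, 1)
	producer := make(chan *frontier.Item, 1)
	sendBackOnRateLimit(frontier.NewItem(u, nil, "seed", 0, "", false), finished, producer)
}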