diff --git a/cmd/get.go b/cmd/get.go index 1b8dc6cf..fc273e2c 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -79,6 +79,7 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.") getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.") getCmd.PersistentFlags().String("cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'") + getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.") // Logging flags getCmd.PersistentFlags().Bool("live-stats", false, "Enable live stats but disable logging. (implies --no-stdout-log)") diff --git a/config/config.go b/config/config.go index 93203594..da36d34a 100644 --- a/config/config.go +++ b/config/config.go @@ -24,6 +24,7 @@ type Config struct { WARCOperator string `mapstructure:"warc-operator"` CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"` WARCTempDir string `mapstructure:"warc-temp-dir"` + WARCSize int `mapstructure:"warc-size"` CDXCookie string `mapstructure:"cdx-cookie"` HQAddress string `mapstructure:"hq-address"` HQKey string `mapstructure:"hq-key"` diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go index 218037c9..aa1108d5 100644 --- a/internal/pkg/crawl/config.go +++ b/internal/pkg/crawl/config.go @@ -102,6 +102,7 @@ type Crawl struct { WARCFullOnDisk bool WARCPoolSize int WARCDedupeSize int + WARCSize int DisableLocalDedupe bool CertValidation bool WARCCustomCookie string @@ -253,6 +254,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.WARCPoolSize = config.WARCPoolSize c.WARCDedupeSize = config.WARCDedupeSize c.WARCCustomCookie = config.CDXCookie + c.WARCSize = config.WARCSize c.API = config.API c.APIPort = config.APIPort diff --git a/internal/pkg/crawl/warc.go b/internal/pkg/crawl/warc.go index 624a54b0..b5a0a616 100644 --- a/internal/pkg/crawl/warc.go +++ b/internal/pkg/crawl/warc.go @@ -16,6 +16,7 @@ func (c *Crawl) initWARCRotatorSettings() *warc.RotatorSettings { rotatorSettings.Prefix = c.WARCPrefix rotatorSettings.WarcinfoContent.Set("software", fmt.Sprintf("Zeno %s", utils.GetVersion().Version)) rotatorSettings.WARCWriterPoolSize = c.WARCPoolSize + rotatorSettings.WarcSize = float64(c.WARCSize) if len(c.WARCOperator) > 0 { rotatorSettings.WarcinfoContent.Set("operator", c.WARCOperator)