diff --git a/docs/cli.md b/docs/cli.md
index 23ff1d96e0..40b4158b1b 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -1121,9 +1121,10 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/
 
 ```
 Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
-                    [--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES]
-                    [--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN]
-                    [--url-column URL_COLUMN] [--error-column ERROR_COLUMN]
+                    [--simple-progress] [-m] [-e] [-g] [-I INPUT_DIR]
+                    [-p PROCESSES] [--chunk-size CHUNK_SIZE]
+                    [--body-column BODY_COLUMN] [--url-column URL_COLUMN]
+                    [--error-column ERROR_COLUMN]
                     [--status-column STATUS_COLUMN]
                     [--encoding-column ENCODING_COLUMN]
                     [--mimetype-column MIMETYPE_COLUMN] [--encoding ENCODING]
@@ -1138,8 +1139,9 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND]
 Use multiple processes to scrape data from a batch of HTML files using
 minet scraping DSL documented here:
 https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-or a python function given using the -m/--module flag, or an already
-implemented typical scraping routine (listed below).
+or a python function given using the -m/--module flag, or a simple inline
+python expression given using the -e/--eval flag, or an already implemented
+typical scraping routine (listed below).
 
 It will output the scraped items as a CSV or NDJSON file.
 
@@ -1186,6 +1188,8 @@ Optional Arguments:
                         Defaults to `encoding`.
   --error-column ERROR_COLUMN
                         Name of the CSV column containing a fetch error.
                         Defaults to `fetch_error`.
+  -e, --eval            Whether given scraper should be a simple
+                        expression to evaluate.
   -f, --format {csv,jsonl,ndjson}
                         Output format. Defaults to `csv`.
   -g, --glob            Will interpret given paths as glob patterns to
@@ -1273,10 +1277,13 @@ Examples:
     $ minet scrape title -i report.csv > titles.csv
 
 . Using the `scrape` (default) function of target python module:
-    $ minet scrape scraper.py -i report.csv > titles.csv
+    $ minet scrape -m scraper.py -i report.csv > titles.csv
 
 . Using the `scrape_title` function of target python module:
-    $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+    $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
+
+. Using an inline python expression to evaluate:
+    $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
 
 . Indicating a custom path column (e.g. "file"):
     $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py
index 696524576a..1bcd9e454c 100644
--- a/minet/cli/scrape/__init__.py
+++ b/minet/cli/scrape/__init__.py
@@ -14,8 +14,9 @@ def resolve_arguments(cli_args):
     Use multiple processes to scrape data from a batch of HTML files using
     minet scraping DSL documented here:
     https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md
-    or a python function given using the -m/--module flag, or an already
-    implemented typical scraping routine (listed below).
+    or a python function given using the -m/--module flag, or a simple inline
+    python expression given using the -e/--eval flag, or an already implemented
+    typical scraping routine (listed below).
 
     It will output the scraped items as a CSV or NDJSON file.
 
@@ -67,10 +68,13 @@ def resolve_arguments(cli_args):
         $ minet scrape title -i report.csv > titles.csv
 
    . Using the `scrape` (default) function of target python module:
-        $ minet scrape scraper.py -i report.csv > titles.csv
+        $ minet scrape -m scraper.py -i report.csv > titles.csv
 
     . Using the `scrape_title` function of target python module:
-        $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv
+        $ minet scrape -m scraper.py:scrape_title -i report.csv > titles.csv
+
+    . Using an inline python expression to evaluate:
+        $ minet scrape -e 'soup.scrape_one("title")' -i report.csv > titles.csv
 
     . Indicating a custom path column (e.g. "file"):
         $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv
@@ -108,6 +112,11 @@ def resolve_arguments(cli_args):
             "help": "Whether given scraper is a python target to import.",
             "action": "store_true",
         },
+        {
+            "flags": ["-e", "--eval"],
+            "help": "Whether given scraper should be a simple expression to evaluate.",
+            "action": "store_true",
+        },
         {
             "flags": ["-g", "--glob"],
             "help": "Will interpret given paths as glob patterns to resolve if given.",
diff --git a/minet/cli/scrape/scrape.py b/minet/cli/scrape/scrape.py
index 335cc05ad5..0152c91e13 100644
--- a/minet/cli/scrape/scrape.py
+++ b/minet/cli/scrape/scrape.py
@@ -124,6 +124,8 @@ def action(cli_args):
     if cli_args.module:
         fn = import_target(cli_args.scraper, default="scrape")
         scraper = FunctionScraper(fn, strain=cli_args.strain)
+    elif cli_args.eval:
+        scraper = FunctionScraper(cli_args.scraper, strain=cli_args.strain)
     elif cli_args.scraper in NAMED_SCRAPERS:
         scraper = NAMED_SCRAPERS[cli_args.scraper]()
     else:
diff --git a/minet/scrape/classes/function.py b/minet/scrape/classes/function.py
index 8f76815a37..f3afd69166 100644
--- a/minet/scrape/classes/function.py
+++ b/minet/scrape/classes/function.py
@@ -36,7 +36,7 @@ def infer_fieldnames_from_function_return_type(fn: Callable) -> Optional[List[st
 
 
 class FunctionScraper(ScraperBase):
-    fn: Callable[[RowWrapper, WonderfulSoup], Any]
+    fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]]
     fieldnames = None
     plural: bool
     tabular = True
@@ -45,9 +45,10 @@ class FunctionScraper(ScraperBase):
 
     def __init__(
         self,
-        fn: Callable[[RowWrapper, WonderfulSoup], Any],
+        fn: Union[str, Callable[[RowWrapper, WonderfulSoup], Any]],
         strain: Optional[str] = None,
     ):
+        # NOTE: closures cannot be pickled without using the third-party library `dill`.
         self.fn = fn
         self.plural = inspect.isgeneratorfunction(fn)
 
@@ -62,4 +63,8 @@ def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None):
         row = context["row"]
         soup = cast(WonderfulSoup, ensure_soup(html, strainer=self.strainer))
 
+        if isinstance(self.fn, str):
+            # Inline expression given via -e/--eval, evaluated with `row` and `soup` in scope.
+            return eval(self.fn, {"row": row, "soup": soup}, None)
+
         return self.fn(row, soup)
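The `-e/--eval` path above simply hands the raw expression string to `eval()` with `row` and `soup` bound as globals. Below is a minimal standalone sketch of that mechanism outside of minet: plain `BeautifulSoup` and a dict stand in for minet's `WonderfulSoup` and `RowWrapper`, and the expression itself is illustrative.

```python
from bs4 import BeautifulSoup

# Hypothetical inline expression, like one passed to -e/--eval.
expression = "soup.title.get_text() if soup.title else None"

html = "<html><head><title>Hello</title></head><body></body></html>"
soup = BeautifulSoup(html, "html.parser")  # stand-in for WonderfulSoup
row = {"url": "https://example.com"}  # stand-in for RowWrapper

# Mirrors FunctionScraper.__call__: evaluate with row/soup in scope.
print(eval(expression, {"row": row, "soup": soup}, None))  # -> Hello
```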
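The NOTE about closures and `dill` matters because `minet scrape` fans work out across processes (`-p/--processes`), so the scraper must cross a process boundary via pickling. A plain string survives that trip, while a lambda or closure does not, which is presumably why the NOTE sits in `FunctionScraper.__init__`. A quick illustration:

```python
import pickle

# A plain string pickles fine, so an inline expression can be shipped
# to worker processes as-is.
pickle.dumps('soup.scrape_one("title")')

# A lambda does not: pickle serializes functions by reference, and a
# lambda's qualified name cannot be resolved on the other side.
try:
    pickle.dumps(lambda row, soup: soup.title)
except pickle.PicklingError as e:
    print("cannot pickle:", e)
```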