From f7f75a4f783608aa64221ad085bcdb9e2a0511d6 Mon Sep 17 00:00:00 2001
From: Thamme Gowda
Date: Sun, 14 Jun 2020 15:11:53 -0700
Subject: [PATCH] Update docs for spark and term_freqs

---
 README.md            | 54 +++++++++++++++++++++++++++++++++++++++++++-
 nlcodec/term_freq.py |  9 +++++---
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 9e5667a..9fd7527 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,6 @@ head somefile.tok | nlcodec decode -m bpe.model -idx
 
 # estimate quality
 head somefile.tok | nlcodec estimate -m bpe.model
-
 ```
 
 ## Python API
@@ -150,6 +149,59 @@ some_type.get_permutations(name=False)
 
 ```
 
+## Scaling for Big Data(sets)
+
+For larger datasets, you can use PySpark to compute term frequencies in a separate step.
+The precomputed frequencies can then be passed to `nlcodec learn` by setting the `-tfs` flag.
+
+To compute term frequencies:
+- Install PySpark: `pip install pyspark`
+- Run the `nlcodec.term_freq` module:
+
+```bash
+$ python -m nlcodec.term_freq -h
+
+usage: term_freq.py [-h] [-i INP [INP ...]] [-wf WORD_FREQS] [-cf CHAR_FREQS]
+                    [-dd] [-ndd]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i INP [INP ...], --inp INP [INP ...]
+                        Input file paths (default: None)
+  -wf WORD_FREQS, --word_freqs WORD_FREQS
+                        Output file path for word frequencies (default: None)
+  -cf CHAR_FREQS, --char_freqs CHAR_FREQS
+                        Output file path for character frequencies (default:
+                        None)
+  -dd, --dedup          Deduplicate the sentences: use only unique sequences
+                        (default: True)
+  -ndd, --no-dedup      Do not deduplicate. (default: False)
+```
+
+### Example
+```bash
+# optional environment variables
+export SPARK_DRIVER_MEM="4g"
+export SPARK_MASTER="local[*]"   # use all CPU cores of the local node
+python -m nlcodec.term_freq -dd -wf words.tsv -cf chars.tsv \
+    -i ~/work/datasets/wmt/data/*-*/*.en.tok
+```
+`words.tsv` and `chars.tsv` hold the word and character frequencies, respectively.
+```bash
+# word vocab of 32K
+python -m nlcodec learn -i words.tsv -tfs -l word -vs 32000 -m word.model
+
+# character vocab with 99.95% coverage
+python -m nlcodec learn -i chars.tsv -tfs -l char -mf 1 -cv 0.9995 -m char.model
+
+# BPE vocab of 8K
+python -m nlcodec learn -i words.tsv -tfs -l bpe -vs 8000 -m bpe.model
+
+# BPE merges until the minimum merge frequency drops to 100;
+# set -vs to some large number, e.g. 64000
+python -m nlcodec learn -i words.tsv -tfs -l bpe -vs 64000 -m bpe.model -cv 0.99995 -mce 100
+```
+
 # Authors
 + [Thamme Gowda](https://twitter.com/thammegowda)
 
diff --git a/nlcodec/term_freq.py b/nlcodec/term_freq.py
index 2656d47..ec9e91b 100644
--- a/nlcodec/term_freq.py
+++ b/nlcodec/term_freq.py
@@ -10,12 +10,15 @@ import json
 
 SPARK_DRIVER_MEM = os.environ.get("SPARK_DRIVER_MEM", "4g")
+SPARK_MASTER = os.environ.get("SPARK_MASTER", "local[*]")
+SPARK_APP_NAME = os.environ.get("SPARK_APP", "NLCoDec")
 
 log.basicConfig(level=log.INFO)
 
-def get_spark():
+def get_spark(app_name=SPARK_APP_NAME, master=SPARK_MASTER, driver_mem=SPARK_DRIVER_MEM):
     return SparkSession.builder \
-        .appName("NL Codec") \
-        .config("spark.driver.memory", SPARK_DRIVER_MEM) \
+        .appName(app_name) \
+        .master(master) \
+        .config("spark.driver.memory", driver_mem) \
         .getOrCreate()
 
 def word_counts(paths: List[Path], dedup=True):
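
Note: for anyone wiring this into their own pipeline, below is a minimal sketch of how the updated `get_spark()` can be used to compute word frequencies directly from Python. Only `get_spark` and its keyword arguments come from this patch; the `count_words` helper is hypothetical, and its `flatMap`/`reduceByKey` pipeline is an assumed implementation of word counting, not necessarily what `nlcodec.term_freq.word_counts` does.

```python
# Illustrative sketch: get_spark() and its keyword arguments are from the
# patch above; count_words() is a hypothetical helper, and the
# flatMap/reduceByKey pipeline is an assumed implementation of word
# counting, not necessarily what nlcodec.term_freq.word_counts() does.
from pathlib import Path
from typing import Dict, List

from nlcodec.term_freq import get_spark


def count_words(paths: List[Path], dedup: bool = True) -> Dict[str, int]:
    spark = get_spark(app_name="MyTermFreqs", master="local[*]", driver_mem="4g")
    sc = spark.sparkContext
    # read all input files into a single RDD of lines
    lines = sc.union([sc.textFile(str(p)) for p in paths])
    if dedup:
        # count each unique sentence only once (the -dd behavior)
        lines = lines.distinct()
    counts = (lines.flatMap(lambda line: line.split())   # whitespace-tokenized words
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b))     # sum counts per word
    return counts.collectAsMap()


if __name__ == '__main__':
    freqs = count_words([Path('somefile.tok')])
    for word, freq in sorted(freqs.items(), key=lambda x: -x[1])[:10]:
        print(f'{word}\t{freq}')
```

One caveat: `collectAsMap()` pulls the whole vocabulary onto the driver, which is one reason `SPARK_DRIVER_MEM` is configurable; for very large vocabularies, writing the RDD out with `saveAsTextFile` keeps the work distributed.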