neuralmagic · mgoin · May 10, 2024 · May 10, 2024 · May 10, 2024 · May 10, 2024
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -0,0 +1,32 @@
+name: test
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .
+        pip install pytest
+    - name: Test
+      run: |
+        pytest tests -s -v
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,189 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# VSCode
+.vscode/
+
+# DS Store
+.DS_Store
+
+# Results
+*.csv
+
+# Python pickle files
+*.pkl
+
+# Sphinx documentation
+_build/
+
+# vim swap files
+*.swo
+*.swp
+
+# hip files generated by PyTorch
+*.hip
+*_hip*
+hip_compat.h
+
+# Benchmark dataset
+*.json
diff --git a/README.md b/README.md
@@ -1,32 +1,57 @@
 # AutoFP8
 
-Open-source FP8 quantization project for producing compressed checkpoints for running in vLLM - see https://github.com/vllm-project/vllm/pull/4332 for implementation.
+Open-source FP8 quantization library for producing compressed checkpoints for running in vLLM - see https://github.com/vllm-project/vllm/pull/4332 for details on the implementation for inference.
 
-## How to quantize a model
+## Installation
 
-Install this repo's requirements:
+Clone this repo and install it from source:
 ```bash
-pip install -r requirements.txt
+git clone https://github.com/neuralmagic/AutoFP8.git
+pip install -e AutoFP8
 ```
 
-Command to produce a `Meta-Llama-3-8B-Instruct-FP8` quantized LLM:
-```bash
-python quantize.py --model-id meta-llama/Meta-Llama-3-8B-Instruct --save-dir Meta-Llama-3-8B-Instruct-FP8
-```
+A stable release will be published.
+
+## Quickstart
+
+This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed.
+
+Once you load your `AutoFP8ForCausalLM`, you can tokenize your data and provide it to the `model.quantize(tokenized_text)` function to calibrate+compress the model.
+
+Finally, you can save your quantized model in a compressed checkpoint format compatible with vLLM using `model.save_quantized("my_model_fp8")`.
+
+Here is a full example covering that flow:
+
+```python
+from transformers import AutoTokenizer
+from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
 
-Example model checkpoint with FP8 static scales for activations and weights: https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-FP8
+pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"
 
-All arguments available for `quantize.py`:
+tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
+examples = ["auto_fp8 is an easy-to-use model quantization library"]
+examples = tokenizer(examples, return_tensors="pt").to("cuda")
+
+quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="dynamic")
+
+model = AutoFP8ForCausalLM.from_pretrained(
+    pretrained_model_dir, quantize_config=quantize_config
+)
+model.quantize(examples)
+model.save_quantized(quantized_model_dir)
 ```
-usage: quantize.py [-h] [--model-id MODEL_ID] [--save-dir SAVE_DIR] [--activation-scheme {static,dynamic}] [--num-samples NUM_SAMPLES] [--max-seq-len MAX_SEQ_LEN]
-
-options:
-  -h, --help            show this help message and exit
-  --model-id MODEL_ID
-  --save-dir SAVE_DIR
-  --activation-scheme {static,dynamic}
-  --num-samples NUM_SAMPLES
-  --max-seq-len MAX_SEQ_LEN
+
+Finally, load it into vLLM for inference! Support began in v0.4.2 (`pip install vllm>=0.4.2`). Note that hardware support for FP8 tensor cores must be available in the GPU you are using (Ada Lovelace, Hopper, and newer).
+
+```python
+from vllm import LLM
+
+model = LLM("Meta-Llama-3-8B-Instruct-FP8")
+# INFO 05-10 18:02:40 model_runner.py:175] Loading model weights took 8.4595 GB
+
+print(model.generate("Once upon a time"))
+# [RequestOutput(request_id=0, prompt='Once upon a time', prompt_token_ids=[128000, 12805, 5304, 264, 892], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' there was a man who fell in love with a woman. The man was so', token_ids=[1070, 574, 264, 893, 889, 11299, 304, 3021, 449, 264, 5333, 13, 578, 893, 574, 779], cumulative_logprob=-21.314169232733548, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=RequestMetrics(arrival_time=1715378569.478381, last_token_time=1715378569.478381, first_scheduled_time=1715378569.480648, first_token_time=1715378569.7070432, time_in_queue=0.002267122268676758, finished_time=1715378570.104807), lora_request=None)]
 ```
 
 ## How to run FP8 quantized models
@@ -36,7 +61,7 @@ options:
 Then simply pass the quantized checkpoint directly to vLLM's entrypoints! It will detect the checkpoint format using the `quantization_config` in the `config.json`.
 ```python
 from vllm import LLM
-model = LLM("nm-testing/Meta-Llama-3-8B-Instruct-FP8")
+model = LLM("neuralmagic/Meta-Llama-3-8B-Instruct-FP8")
 # INFO 05-06 10:06:23 model_runner.py:172] Loading model weights took 8.4596 GB
 
 outputs = model.generate("Once upon a time,")

diff --git a/auto_fp8/__init__.py b/auto_fp8/__init__.py
@@ -0,0 +1,7 @@
+from .modeling import AutoFP8ForCausalLM
+from .config import BaseQuantizeConfig
+
+__all__ = [
+    "AutoFP8ForCausalLM",
+    "BaseQuantizeConfig",
+]
diff --git a/auto_fp8/config.py b/auto_fp8/config.py
@@ -0,0 +1,10 @@
+class BaseQuantizeConfig:
+    def __init__(self, quant_method="fp8", activation_scheme="static"):
+        if quant_method != "fp8":
+            raise ValueError("Only FP8 quantization is supported.")
+        if activation_scheme not in ["static", "dynamic"]:
+            raise ValueError(
+                "Invalid activation_scheme. Choose either 'static' or 'dynamic'."
+            )
+        self.quant_method = quant_method
+        self.activation_scheme = activation_scheme