Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite packed file parser with finite state machines #16

Merged
merged 1 commit into from
Jan 15, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 72 additions & 28 deletions src/pkglite/unpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path

from .cli import (
Expand Down Expand Up @@ -68,9 +69,17 @@ def process_content_line(line: str) -> str:
return line[2:] if line.startswith(" ") else ""


class ParserState(Enum):
    """Enumerates the modes of the packed-file parser's state machine.

    The parser is always in exactly one of these states while it walks the
    packed file line by line.
    """

    # Start state: waiting for a "Package:" line.
    EXPECTING_PACKAGE = auto()

    # Inside an entry header: waiting for "File:", "Format:" or "Content:".
    EXPECTING_METADATA = auto()

    # After "Content:": lines are accumulated as file content.
    READING_CONTENT = auto()


def parse_packed_file(input_file: str) -> Sequence[FileData]:
"""
Parse the packed text file and extract file data.
Parse the packed text file and extract file data using a finite state machine.

Args:
input_file: Path to the packed file.
Expand Down Expand Up @@ -104,43 +113,78 @@ def process_file_entry(
content=content["content"],
)

def handle_package_state(line: str) -> tuple[ParserState, dict[str, str] | None]:
    """Process one line while in the EXPECTING_PACKAGE state.

    Args:
        line: The current input line.

    Returns:
        A ``(next_state, new_entry)`` pair. When ``extract_metadata_field``
        yields a truthy value for the "Package" field, the next state is
        EXPECTING_METADATA and ``new_entry`` is a fresh dict holding that
        value under the "package" key. Otherwise the state stays
        EXPECTING_PACKAGE and ``new_entry`` is None.
    """
    name = extract_metadata_field(line, "Package")
    if not name:
        # Not a package header: keep waiting.
        return ParserState.EXPECTING_PACKAGE, None
    return ParserState.EXPECTING_METADATA, {"package": name}

def handle_metadata_state(
    line: str, current: dict[str, str]
) -> tuple[ParserState, bool]:
    """Process one line while in the EXPECTING_METADATA state.

    Args:
        line: The current input line.
        current: The entry being assembled; updated in place when the line
            provides a "File" or "Format" value.

    Returns:
        A ``(next_state, recognized)`` pair. ``recognized`` is True when the
        line was a File field, a Format field, or the literal "Content:"
        marker, and False for any other line. Only the "Content:" marker
        moves the parser to READING_CONTENT.
    """
    # Checked in this order: File first, then Format.
    field_to_key = (("File", "path"), ("Format", "format"))
    for field_name, key in field_to_key:
        value = extract_metadata_field(line, field_name)
        if value:
            current[key] = value
            return ParserState.EXPECTING_METADATA, True

    if line == "Content:":
        return ParserState.READING_CONTENT, True

    # Unrecognized line: report no update and remain in the same state.
    return ParserState.EXPECTING_METADATA, False

def finalize_current_entry(
    current: dict[str, str], content_lines: list[str]
) -> FileData | None:
    """Build a FileData for the entry being assembled, if one can be made.

    Args:
        current: Metadata collected so far for the entry.
        content_lines: Content lines collected so far for the entry.

    Returns:
        The result of ``process_file_entry`` when it is truthy, else None.
    """
    entry = process_file_entry(current, content_lines)
    return entry if entry else None

files: list[FileData] = []
current_file: dict[str, str] = {}
content_lines: list[str] = []
in_content = False
state = ParserState.EXPECTING_PACKAGE

with open(input_file, "r", encoding="utf-8") as f:
for line in f:
line = line.rstrip()

package_name = extract_metadata_field(line, "Package")
if package_name:
if state == ParserState.EXPECTING_PACKAGE:
# Handle transition from current package to new package
if current_file:
if file_data := process_file_entry(current_file, content_lines):
if file_data := finalize_current_entry(current_file, content_lines):
files.append(file_data)

new_state, new_file = handle_package_state(line)
if new_file:
current_file = new_file
content_lines = []
state = new_state

elif state == ParserState.EXPECTING_METADATA:
new_state, updated = handle_metadata_state(line, current_file)
if updated:
state = new_state

elif state == ParserState.READING_CONTENT:
if package_name := extract_metadata_field(line, "Package"):
# New package found while reading content
if file_data := finalize_current_entry(current_file, content_lines):
files.append(file_data)
current_file = {"package": package_name}
content_lines = []
in_content = False
continue

if not in_content:
path = extract_metadata_field(line, "File")
if path:
current_file["path"] = path
continue

file_format = extract_metadata_field(line, "Format")
if file_format:
current_file["format"] = file_format
continue

if line == "Content:":
in_content = True
continue
else:
content_lines.append(process_content_line(line))

if file_data := process_file_entry(current_file, content_lines):
current_file = {"package": package_name}
content_lines = []
state = ParserState.EXPECTING_METADATA
else:
content_lines.append(process_content_line(line))

# Handle the last file entry
if current_file and (
file_data := finalize_current_entry(current_file, content_lines)
):
files.append(file_data)

return files
Expand Down
Loading