Skip to content

Commit

Permalink
Use RT field to distinguish annot groups (like strikeout & caret) from actual replies
Browse files Browse the repository at this point in the history
  • Loading branch information
0xabu committed Jan 1, 2025
1 parent b23213f commit b07da56
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 37 deletions.
11 changes: 10 additions & 1 deletion pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,18 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

in_reply_to = pa.get('IRT')
is_group = False
if in_reply_to is not None:
reply_type = pa.get('RT')
if reply_type is PSLiteralTable.intern('Group'):
is_group = True
elif not (reply_type is None or reply_type is PSLiteralTable.intern('R')):
logger.warning("Unexpected RT=%s, treated as R", reply_type)

return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
contents=contents, author=author, created=created, color=rgb,
in_reply_to_ref=pa.get('IRT'))
in_reply_to_ref=in_reply_to, is_group_child=is_group)


def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
Expand Down
6 changes: 3 additions & 3 deletions pdfannots/printer/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ def annot_to_dict(
"author": annot.author,
"created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
"color": ('#' + annot.color.ashex()) if annot.color else None,
"in_reply_to": (annot.in_reply_to.name if annot.in_reply_to and annot.in_reply_to.name
else None),
"in_reply_to": annot.in_reply_to.name if annot.in_reply_to else None,
}

# Remove keys with None values in nested dictionary and return
Expand Down Expand Up @@ -62,5 +61,6 @@ def print_file(
else:
self.seen_first = True

annots = [annot_to_dict(document, a, self.remove_hyphens) for a in document.iter_annots()]
annots = [annot_to_dict(document, a, self.remove_hyphens)
for a in document.iter_annots(include_replies=True)]
yield from json.JSONEncoder(indent=2, ensure_ascii=self.ensure_ascii).iterencode(annots)
17 changes: 9 additions & 8 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,12 @@ def format_annot(
) -> str:
# Limited support for Caret annotations with a single "reply" of type StrikeOut
contents = annot.contents
if (annot.subtype == AnnotationType.Caret and annot.replies
and annot.replies[0].subtype == AnnotationType.StrikeOut):
annot = annot.replies[0]
if annot.contents:
logger.warning("Ignored StrikeOut comment: %s", annot.contents)
if annot.subtype == AnnotationType.Caret and annot.group_children:
child = annot.get_child_by_type(AnnotationType.StrikeOut)
if child:
annot = child
if child.contents:
logger.warning("Ignored StrikeOut comment: %s", child.contents)

# capture item text and contents (i.e. the comment), and split the latter into paragraphs
text = annot.gettext(self.remove_hyphens) or ''
Expand Down Expand Up @@ -280,7 +281,7 @@ def emit_body(
self,
document: Document
) -> typ.Iterator[str]:
for a in document.iter_annots(include_replies=False):
for a in document.iter_annots():
yield self.format_annot(a, document, a.subtype.name)


Expand Down Expand Up @@ -331,7 +332,7 @@ def fmt_header(name: str, level: int = 2) -> str:
highlights: typ.List[Annotation] = [] # When grouping by color holds only undefined annots
highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)

for a in document.iter_annots(include_replies=False):
for a in document.iter_annots():
if a.subtype in self.ANNOT_NITS:
nits.append(a)
elif a.contents:
Expand Down Expand Up @@ -367,7 +368,7 @@ def fmt_header(name: str, level: int = 2) -> str:
for a in nits:
extra = None
if a.subtype == AnnotationType.Caret:
if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut:
if a.get_child_by_type(AnnotationType.StrikeOut):
extra = "suggested replacement"
else:
extra = "suggested insertion"
Expand Down
61 changes: 44 additions & 17 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,16 +296,18 @@ class Annotation(ObjectWithPos):
A PDF annotation, and its extracted text.
Attributes:
author Author of the annotation
color RGB color of the annotation
contents Contents of the annotation in the PDF (e.g. comment/description)
created Timestamp the annotation was created
in_reply_to Reference to another annotation on the page that this is "in reply to"
last_charseq Sequence number of the most recent character in text
name If present, uniquely identifies this annotation among others on the page
replies Annotations replying to this one (reverse of in_reply_to)
subtype PDF annotation type
text Text in the order captured (use gettext() for a cleaner form)
author Author of the annotation
color RGB color of the annotation
contents Contents of the annotation in the PDF (e.g. comment/description)
created Timestamp the annotation was created
group_children Annotations grouped together with this one
in_reply_to Reference to another annotation on the page that this is "in reply to"
is_group_child Is this annotation a member of a parent group?
last_charseq Sequence number of the most recent character in text
name If present, uniquely identifies this annotation among others on the page
replies Annotations replying to this one (reverse of in_reply_to)
subtype PDF annotation type
text Text in the order captured (use gettext() for a cleaner form)
Attributes updated for StrikeOut and Caret annotations:
pre_context Text captured just prior to the beginning of 'text'
Expand All @@ -314,6 +316,7 @@ class Annotation(ObjectWithPos):

boxes: typ.List[Box]
contents: typ.Optional[str]
group_children: typ.List[Annotation]
in_reply_to: typ.Optional[Annotation]
pre_context: typ.Optional[str]
post_context: typ.Optional[str]
Expand All @@ -330,6 +333,7 @@ def __init__(
color: typ.Optional[RGB] = None,
contents: typ.Optional[str] = None,
in_reply_to_ref: typ.Optional[PDFObjRef] = None,
is_group_child: bool = False,
name: typ.Optional[str] = None,
quadpoints: typ.Optional[typ.Sequence[float]] = None,
rect: typ.Optional[BoxCoords] = None):
Expand Down Expand Up @@ -364,6 +368,7 @@ def __init__(
self.color = color
self.contents = contents if contents else None
self.created = created
self.group_children = []
self.name = name
self.last_charseq = 0
self.post_context = None
Expand All @@ -373,8 +378,11 @@ def __init__(
self.text = []

# The in_reply_to reference will be resolved in postprocess()
self._in_reply_to_ref = in_reply_to_ref
self.in_reply_to = None
self._in_reply_to_ref = in_reply_to_ref
self.is_group_child = is_group_child
if is_group_child:
assert in_reply_to_ref

def __repr__(self) -> str:
return ('<Annotation %s %r%s%s>' %
Expand Down Expand Up @@ -402,6 +410,13 @@ def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:
else:
return None

def get_child_by_type(self, child_type: AnnotationType) -> typ.Optional[Annotation]:
"""Return the first child of the given type."""
for c in self.group_children:
if c.subtype == child_type:
return c
return None

def wants_context(self) -> bool:
"""Returns true if this annotation type should include context."""
return self.subtype in {AnnotationType.Caret, AnnotationType.StrikeOut}
Expand Down Expand Up @@ -440,9 +455,15 @@ def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
# Resolve the in_reply_to object reference to its annotation
if self._in_reply_to_ref is not None:
assert self.in_reply_to is None # This should be called once only
self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid)
if self.in_reply_to is not None:
self.in_reply_to.replies.append(self)
a = annots_by_objid.get(self._in_reply_to_ref.objid)
if a is None:
logger.warning("IRT reference (%d) not found in page annotations",
self._in_reply_to_ref.objid)
elif self.is_group_child:
a.group_children.append(self)
else:
self.in_reply_to = a
a.replies.append(self)

# The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
# default initial contents are a copy of the selected text. Unless the user goes to
Expand Down Expand Up @@ -514,11 +535,17 @@ class Document:
def __init__(self) -> None:
self.pages = []

def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]:
"""Iterate over all the annotations in the document."""
def iter_annots(self, *, include_replies: bool = False) -> typ.Iterator[Annotation]:
"""
Iterate over all the annotations in the document.
Only the primary annotation for a group is included.
Replies are included only if include_replies is True.
"""

for p in self.pages:
for a in p.annots:
if include_replies or not a.in_reply_to:
if not a.is_group_child and (include_replies or not a.in_reply_to):
yield a

def nearest_outline(
Expand Down
19 changes: 11 additions & 8 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,14 +278,17 @@ class CaretAnnotations(ExtractionTestBase):

def test(self) -> None:
self.assertEqual(len(self.annots), 5)
self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut)
self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader')
self.assertEqual(self.annots[3].subtype, AnnotationType.Caret)
self.assertEqual(self.annots[3].contents, 'Google Chrome')
self.assertEqual(self.annots[0].in_reply_to, self.annots[3])
self.assertEqual(self.annots[3].replies, [self.annots[0]])
self.assertEqual(self.annots[0].replies, [])
self.assertEqual(self.annots[3].in_reply_to, None)
a = self.annots[0]
self.assertEqual(a.subtype, AnnotationType.StrikeOut)
self.assertEqual(a.gettext(), 'Adobe Acrobat Reader')
self.assertTrue(a.is_group_child)
self.assertEqual(a.group_children, [])
g = self.annots[3]
self.assertEqual(g.subtype, AnnotationType.Caret)
self.assertEqual(g.contents, 'Google Chrome')
self.assertFalse(g.is_group_child)
self.assertEqual(g.group_children, [a])
self.assertEqual(g.get_child_by_type(AnnotationType.StrikeOut), a)


class PrinterTestBase(unittest.TestCase):
Expand Down

0 comments on commit b07da56

Please sign in to comment.