Use RT field to distinguish annot groups (like strikeout & caret) from actual replies
0xabu · Jan 1, 2025 · b07da56 · b07da56
1 parent b23213f
commit b07da56
Show file tree

Hide file tree

Showing 5 changed files with 77 additions and 37 deletions.
diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
@@ -106,9 +106,18 @@ def _mkannotation(
         createds = pdfminer.utils.decode_text(createds)
         created = decode_datetime(createds)
 
+    in_reply_to = pa.get('IRT')
+    is_group = False
+    if in_reply_to is not None:
+        reply_type = pa.get('RT')
+        if reply_type is PSLiteralTable.intern('Group'):
+            is_group = True
+        elif not (reply_type is None or reply_type is PSLiteralTable.intern('R')):
+            logger.warning("Unexpected RT=%s, treated as R", reply_type)
+
     return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
                       contents=contents, author=author, created=created, color=rgb,
-                      in_reply_to_ref=pa.get('IRT'))
+                      in_reply_to_ref=in_reply_to, is_group_child=is_group)
 
 
 def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:

diff --git a/pdfannots/printer/json.py b/pdfannots/printer/json.py
@@ -25,8 +25,7 @@ def annot_to_dict(
         "author": annot.author,
         "created": annot.created.strftime('%Y-%m-%dT%H:%M:%S') if annot.created else None,
         "color": ('#' + annot.color.ashex()) if annot.color else None,
-        "in_reply_to": (annot.in_reply_to.name if annot.in_reply_to and annot.in_reply_to.name
-                        else None),
+        "in_reply_to": annot.in_reply_to.name if annot.in_reply_to else None,
     }
 
     # Remove keys with None values in nested dictionary and return
@@ -62,5 +61,6 @@ def print_file(
         else:
             self.seen_first = True
 
-        annots = [annot_to_dict(document, a, self.remove_hyphens) for a in document.iter_annots()]
+        annots = [annot_to_dict(document, a, self.remove_hyphens)
+                  for a in document.iter_annots(include_replies=True)]
         yield from json.JSONEncoder(indent=2, ensure_ascii=self.ensure_ascii).iterencode(annots)
diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py
@@ -223,11 +223,12 @@ def format_annot(
     ) -> str:
         # Limited support for Caret annotations with a single "reply" of type StrikeOut
         contents = annot.contents
-        if (annot.subtype == AnnotationType.Caret and annot.replies
-                and annot.replies[0].subtype == AnnotationType.StrikeOut):
-            annot = annot.replies[0]
-            if annot.contents:
-                logger.warning("Ignored StrikeOut comment: %s", annot.contents)
+        if annot.subtype == AnnotationType.Caret and annot.group_children:
+            child = annot.get_child_by_type(AnnotationType.StrikeOut)
+            if child:
+                annot = child
+                if child.contents:
+                    logger.warning("Ignored StrikeOut comment: %s", child.contents)
 
         # capture item text and contents (i.e. the comment), and split the latter into paragraphs
         text = annot.gettext(self.remove_hyphens) or ''
@@ -280,7 +281,7 @@ def emit_body(
         self,
         document: Document
     ) -> typ.Iterator[str]:
-        for a in document.iter_annots(include_replies=False):
+        for a in document.iter_annots():
             yield self.format_annot(a, document, a.subtype.name)
 
 
@@ -331,7 +332,7 @@ def fmt_header(name: str, level: int = 2) -> str:
         highlights: typ.List[Annotation] = []  # When grouping by color holds only undefined annots
         highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)
 
-        for a in document.iter_annots(include_replies=False):
+        for a in document.iter_annots():
             if a.subtype in self.ANNOT_NITS:
                 nits.append(a)
             elif a.contents:
@@ -367,7 +368,7 @@ def fmt_header(name: str, level: int = 2) -> str:
                 for a in nits:
                     extra = None
                     if a.subtype == AnnotationType.Caret:
-                        if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut:
+                        if a.get_child_by_type(AnnotationType.StrikeOut):
                             extra = "suggested replacement"
                         else:
                             extra = "suggested insertion"

diff --git a/pdfannots/types.py b/pdfannots/types.py
@@ -296,16 +296,18 @@ class Annotation(ObjectWithPos):
     A PDF annotation, and its extracted text.
 
     Attributes:
-        author       Author of the annotation
-        color        RGB color of the annotation
-        contents     Contents of the annotation in the PDF (e.g. comment/description)
-        created      Timestamp the annotation was created
-        in_reply_to  Reference to another annotation on the page that this is "in reply to"
-        last_charseq Sequence number of the most recent character in text
-        name         If present, uniquely identifies this annotation among others on the page
-        replies      Annotations replying to this one (reverse of in_reply_to)
-        subtype      PDF annotation type
-        text         Text in the order captured (use gettext() for a cleaner form)
+        author          Author of the annotation
+        color           RGB color of the annotation
+        contents        Contents of the annotation in the PDF (e.g. comment/description)
+        created         Timestamp the annotation was created
+        group_children  Annotations grouped together with this one
+        in_reply_to     Reference to another annotation on the page that this is "in reply to"
+        is_group_child  Is this annotation a member of a parent group?
+        last_charseq    Sequence number of the most recent character in text
+        name            If present, uniquely identifies this annotation among others on the page
+        replies         Annotations replying to this one (reverse of in_reply_to)
+        subtype         PDF annotation type
+        text            Text in the order captured (use gettext() for a cleaner form)
 
     Attributes updated for StrikeOut and Caret annotations:
         pre_context  Text captured just prior to the beginning of 'text'
@@ -314,6 +316,7 @@ class Annotation(ObjectWithPos):
 
     boxes: typ.List[Box]
     contents: typ.Optional[str]
+    group_children: typ.List[Annotation]
     in_reply_to: typ.Optional[Annotation]
     pre_context: typ.Optional[str]
     post_context: typ.Optional[str]
@@ -330,6 +333,7 @@ def __init__(
             color: typ.Optional[RGB] = None,
             contents: typ.Optional[str] = None,
             in_reply_to_ref: typ.Optional[PDFObjRef] = None,
+            is_group_child: bool = False,
             name: typ.Optional[str] = None,
             quadpoints: typ.Optional[typ.Sequence[float]] = None,
             rect: typ.Optional[BoxCoords] = None):
@@ -364,6 +368,7 @@ def __init__(
         self.color = color
         self.contents = contents if contents else None
         self.created = created
+        self.group_children = []
         self.name = name
         self.last_charseq = 0
         self.post_context = None
@@ -373,8 +378,11 @@ def __init__(
         self.text = []
 
         # The in_reply_to reference will be resolved in postprocess()
-        self._in_reply_to_ref = in_reply_to_ref
         self.in_reply_to = None
+        self._in_reply_to_ref = in_reply_to_ref
+        self.is_group_child = is_group_child
+        if is_group_child:
+            assert in_reply_to_ref
 
     def __repr__(self) -> str:
         return ('<Annotation %s %r%s%s>' %
@@ -402,6 +410,13 @@ def gettext(self, remove_hyphens: bool = False) -> typ.Optional[str]:
         else:
             return None
 
+    def get_child_by_type(self, child_type: AnnotationType) -> typ.Optional[Annotation]:
+        """Return the first child of the given type."""
+        for c in self.group_children:
+            if c.subtype == child_type:
+                return c
+        return None
+
     def wants_context(self) -> bool:
         """Returns true if this annotation type should include context."""
         return self.subtype in {AnnotationType.Caret, AnnotationType.StrikeOut}
@@ -440,9 +455,15 @@ def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
         # Resole the in_reply_to object reference to its annotation
         if self._in_reply_to_ref is not None:
             assert self.in_reply_to is None  # This should be called once only
-            self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid)
-            if self.in_reply_to is not None:
-                self.in_reply_to.replies.append(self)
+            a = annots_by_objid.get(self._in_reply_to_ref.objid)
+            if a is None:
+                logger.warning("IRT reference (%d) not found in page annotations",
+                               self._in_reply_to_ref.objid)
+            elif self.is_group_child:
+                a.group_children.append(self)
+            else:
+                self.in_reply_to = a
+                a.replies.append(self)
 
         # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
         # default initial contents are a copy of the selected text. Unless the user goes to
@@ -514,11 +535,17 @@ class Document:
     def __init__(self) -> None:
         self.pages = []
 
-    def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]:
-        """Iterate over all the annotations in the document."""
+    def iter_annots(self, *, include_replies: bool = False) -> typ.Iterator[Annotation]:
+        """
+        Iterate over all the annotations in the document.
+
+        Only the primary annotation for a group is included.
+        Replies are included only if include_replies is True.
+        """
+
         for p in self.pages:
             for a in p.annots:
-                if include_replies or not a.in_reply_to:
+                if not a.is_group_child and (include_replies or not a.in_reply_to):
                     yield a
 
     def nearest_outline(

diff --git a/tests.py b/tests.py
@@ -278,14 +278,17 @@ class CaretAnnotations(ExtractionTestBase):
 
     def test(self) -> None:
         self.assertEqual(len(self.annots), 5)
-        self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut)
-        self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader')
-        self.assertEqual(self.annots[3].subtype, AnnotationType.Caret)
-        self.assertEqual(self.annots[3].contents, 'Google Chrome')
-        self.assertEqual(self.annots[0].in_reply_to, self.annots[3])
-        self.assertEqual(self.annots[3].replies, [self.annots[0]])
-        self.assertEqual(self.annots[0].replies, [])
-        self.assertEqual(self.annots[3].in_reply_to, None)
+        a = self.annots[0]
+        self.assertEqual(a.subtype, AnnotationType.StrikeOut)
+        self.assertEqual(a.gettext(), 'Adobe Acrobat Reader')
+        self.assertTrue(a.is_group_child)
+        self.assertEqual(a.group_children, [])
+        g = self.annots[3]
+        self.assertEqual(g.subtype, AnnotationType.Caret)
+        self.assertEqual(g.contents, 'Google Chrome')
+        self.assertFalse(g.is_group_child)
+        self.assertEqual(g.group_children, [a])
+        self.assertEqual(g.get_child_by_type(AnnotationType.StrikeOut), a)
 
 
 class PrinterTestBase(unittest.TestCase):