Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d983a6f
Working on ISO standard for IDocTags
PeterStaar-IBM Dec 11, 2025
4de44f1
fixed the pre-commit, still tons of work to do
PeterStaar-IBM Dec 11, 2025
6a4653d
updated idoctag token class
PeterStaar-IBM Dec 11, 2025
b7c641d
need to start fixing the bugs and add proper tests for idoctags
PeterStaar-IBM Dec 11, 2025
312ebe9
fixed the tests
PeterStaar-IBM Dec 12, 2025
0e6173c
refactored the testing for doctags and idoctags
PeterStaar-IBM Dec 12, 2025
607441e
updating the location tokens in idoctags
PeterStaar-IBM Dec 12, 2025
77e9914
removed the DocumentToken
PeterStaar-IBM Dec 12, 2025
98ed500
fixed the idt quot issue
PeterStaar-IBM Dec 12, 2025
bac016a
reformatted
PeterStaar-IBM Dec 12, 2025
95b9219
Added IDocTagsSerializationMode
PeterStaar-IBM Dec 12, 2025
6d2c7f2
working on the deserializer
PeterStaar-IBM Dec 12, 2025
09c4f9a
fixed the captions for floating items
PeterStaar-IBM Dec 12, 2025
3c706e2
still need to fix some deserialization tests
PeterStaar-IBM Dec 12, 2025
75c92f8
finally all is working and reformatted
PeterStaar-IBM Dec 12, 2025
5b9a1dc
made the OTSL serialization and deserialization self-contained in IDo…
PeterStaar-IBM Dec 14, 2025
cc7f2a4
fixed some unclean code
PeterStaar-IBM Dec 14, 2025
ce5f1b7
did some fixes and clean up
PeterStaar-IBM Dec 16, 2025
0cf8fc3
added extra tests, expanded to deserialize to deal with nested lists
PeterStaar-IBM Dec 16, 2025
16b965f
added the deserialization for formatting
PeterStaar-IBM Dec 16, 2025
e132f04
fixed the formatting and the complex inline groups in nested lists
PeterStaar-IBM Dec 16, 2025
20bc4a0
added get_category
PeterStaar-IBM Dec 16, 2025
2e779a9
make IS_SELFCLOSING a set
PeterStaar-IBM Dec 16, 2025
ce45c76
removed the regex
PeterStaar-IBM Dec 16, 2025
b8e85bb
fixed the if ... return into if ... elif ... return
PeterStaar-IBM Dec 17, 2025
33954cf
fixed the facets parsing
PeterStaar-IBM Dec 17, 2025
3ee7dd1
fixed the footnotes for Tables and Pictures in IDocTags
PeterStaar-IBM Dec 17, 2025
f56e5bf
updated with latest doctags token table
PeterStaar-IBM Dec 17, 2025
4184ad3
removed private name
PeterStaar-IBM Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,765 changes: 2,574 additions & 191 deletions docling_core/experimental/idoctags.py

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions docling_core/transforms/serializer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,15 @@ def serialize_captions(
"""Serialize the item's captions."""
...

@abstractmethod
def serialize_footnotes(self, item: FloatingItem, **kwargs: Any) -> SerializationResult:
    """Serialize the footnotes attached to a floating item.

    Args:
        item: The floating item whose footnotes should be serialized.
        **kwargs: Extra keyword arguments; their interpretation is left to
            the concrete serializer.

    Returns:
        The serialization result covering the item's footnotes.
    """

@deprecated("Use serialize_meta() instead.")
@abstractmethod
def serialize_annotations(
Expand Down
41 changes: 41 additions & 0 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,21 @@ def _captions_of_some_item(self) -> set[str]:
}
return refs

@computed_field  # type: ignore[misc]
@cached_property
def _footnotes_of_some_item(self) -> set[str]:
    # Collects the `cref` of every footnote referenced by any FloatingItem in
    # the document, so those footnote items can be recognised later on.
    all_layers = set(ContentLayer)  # TODO review
    footnote_refs: set[str] = set()
    for node, _ in self.doc.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=all_layers,
    ):
        # Only floating items carry a `footnotes` list; skip everything else.
        if not isinstance(node, FloatingItem):
            continue
        footnote_refs.update(ftn.cref for ftn in node.footnotes)
    return footnote_refs

@override
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
"""References to excluded items."""
Expand Down Expand Up @@ -392,6 +407,9 @@ def serialize(
if my_item.self_ref in self._captions_of_some_item:
# those captions will be handled by the floating item holding them
return empty_res
elif my_item.self_ref in self._footnotes_of_some_item:
# those footnotes will be handled by the floating item holding them
return empty_res
else:
part = (
self.text_serializer.serialize(
Expand Down Expand Up @@ -589,6 +607,29 @@ def serialize_captions(
text_res = ""
return create_ser_result(text=text_res, span_source=results)

@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the footnotes referenced by a floating item.

    Args:
        item: The floating item whose ``footnotes`` references are resolved
            against ``self.doc``.
        **kwargs: Overrides patched onto ``self.params``; also passed on to
            ``get_excluded_refs``.

    Returns:
        A serialization result whose text is the footnote texts joined with
        the caption delimiter and post-processed, or an empty text when the
        footnote label is not among the enabled labels.
    """
    merged = self.params.merge_with_patch(patch=kwargs)

    # Footnotes disabled through the label filter: nothing to emit.
    if DocItemLabel.FOOTNOTE not in merged.labels:
        return create_ser_result(text="", span_source=[])

    parts: list[SerializationResult] = []
    for ref in item.footnotes:
        target = ref.resolve(self.doc)
        # Only text items contribute footnote text.
        if not isinstance(target, TextItem):
            continue
        # Leave out footnotes that are excluded from serialization.
        if target.self_ref in self.get_excluded_refs(**kwargs):
            continue
        parts.append(create_ser_result(text=target.text, span_source=target))

    # FIXME: using the caption_delimiter for now ...
    joined = merged.caption_delim.join(part.text for part in parts)
    processed = self.post_process(text=joined)
    return create_ser_result(text=processed, span_source=parts)

@override
def serialize_meta(
self,
Expand Down
Loading
Loading