Skip to content

Commit

Permalink
Improvements for rich text adaptation - repeated annotations
Browse files Browse the repository at this point in the history
When inline annotations are repeated, it was observed that the markup is always restored on the first occurrence only.

For example, if the input content has a link that is repeated at the end
of the content, the annotation is restored for the first occurrence only, leaving the last one without markup.

The same happens with references when they are repeated in the input
content.

To fix this, keep track of restored annotations and avoid duplicate restorations.

Test included.

Bug: T353791
Bug: T340956
Change-Id: I4842fb3123261e3dc8082170bb298eb956070e31
  • Loading branch information
santhoshtr committed Mar 20, 2024
1 parent bac08d2 commit 9274221
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
14 changes: 14 additions & 0 deletions test/unit/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@
</p>\
"""

# Source fixture for the repeated-annotation case: the same link
# (href="./GNU") occurs twice in one paragraph, followed by a different link.
test_repeated_element = """\
<p><a href="./GNU">GNU</a> is an abbreviation of \
<a href="./GNU">GNU</a> is not <a href="./Unix">Unix</a></p>
"""

# Expected result for test_repeated_element: the words appear in reverse order
# and each of the three links, including both "./GNU" links, is still wrapped
# around its own word.
# NOTE(review): presumably the tests run with a word-reversing stand-in for the
# translation model -- confirm against the test setup.
translated__repeated_element = """\
<p><a href="./Unix">Unix</a> not is <a href="./GNU">GNU</a> of \
abbreviation an is <a href="./GNU">GNU</a></p>
"""

tests = [
{"source": test_dodo_html, "translation": translated_dodo_html},
{"source": test_dodo_html, "translation": translated_dodo_html},
Expand All @@ -124,6 +134,10 @@
"source": test_references_ml,
"translation": translated_references_ml,
},
{
"source": test_repeated_element,
"translation": translated__repeated_element,
},
]


Expand Down
15 changes: 13 additions & 2 deletions translator/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def __init__(self, config, source_lang, target_lang, model_name: str = None):
self.translatables: Dict[str, str] = {}
# Keep all paragraph text mapped to sentences in it.
self.paragraphs: Dict[str, List[str]] = {}
self.match_location_cache: Dict[str, int] = {}

def translate(self, html: str) -> str:
"""
Expand Down Expand Up @@ -219,6 +220,7 @@ def translate_node(self, doc: BeautifulSoup) -> None:
)
self.translatables = dict(zip(sentences, translated_sentences))
# Now apply the translation on same json object
self.match_location_cache = {} # Clear the cache
self.traverse(doc, mode="apply")

def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
Expand Down Expand Up @@ -272,7 +274,11 @@ def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
node_html = str(node)

# Locate node_text in doc_inner_content
# print("\t", doc.name, ">", node.name, ">", node_text)
if node_text in self.match_location_cache:
# This node_text was located before, so this is a repeated occurrence of the same
# text somewhere else in the document.
# Start the search after the previous location.
search_start = self.match_location_cache.get(node_text)
(match, translation_start, translation_end) = fuzzy_find(
doc_inner_content, node_text, search_start=search_start
)
Expand All @@ -292,7 +298,12 @@ def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
doc_inner_content[translation_end:],
]
)
search_start = translation_start + len(node_html)

# Remember the location at which this node_text was found.
# If this node_text appears again in the document, don't apply html wrapping
# again since that would result in invalid content like:
# `<b><b>abc</b></b> another abc`
self.match_location_cache[node_text] = translation_start + len(node_html)

doc.clear()
doc.insert(
Expand Down

0 comments on commit 9274221

Please sign in to comment.