Improvements for rich text adaptation - repeated annotations

When inline annotations are repeated, it was observed that the markup is always restored on the first occurrence only. For example, if the input content has a link that repeats at the end of the content, the annotation will be restored for the first occurrence, leaving the last one without markup. The same happens with references when they are repeated in the input content. To fix this, keep track of restored annotations and avoid duplicate restorations. Test included. Bug: T353791 Bug: T340956 Change-Id: I4842fb3123261e3dc8082170bb298eb956070e31
wikimedia · Mar 20, 2024 · 9274221 · 9274221
1 parent bac08d2
commit 9274221
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 2 deletions.
diff --git a/test/unit/test_html.py b/test/unit/test_html.py
@@ -111,6 +111,16 @@
 </p>\
 """
 
+test_repeated_element = """\
+<p><a href="./GNU">GNU</a> is an abbreviation of \
+<a href="./GNU">GNU</a> is not <a href="./Unix">Unix</a></p>
+"""
+
+translated__repeated_element = """\
+<p><a href="./Unix">Unix</a> not is <a href="./GNU">GNU</a> of \
+abbreviation an is <a href="./GNU">GNU</a></p>
+"""
+
 tests = [
     {"source": test_dodo_html, "translation": translated_dodo_html},
     {"source": test_dodo_html, "translation": translated_dodo_html},
@@ -124,6 +134,10 @@
         "source": test_references_ml,
         "translation": translated_references_ml,
     },
+    {
+        "source": test_repeated_element,
+        "translation": translated__repeated_element,
+    },
 ]
 
 

diff --git a/translator/html.py b/translator/html.py
@@ -190,6 +190,7 @@ def __init__(self, config, source_lang, target_lang, model_name: str = None):
         self.translatables: Dict[str, str] = {}
         # Keep all paragraph text mapped to sentences in it.
         self.paragraphs: Dict[str, List[str]] = {}
+        self.match_location_cache: Dict[str, int] = {}
 
     def translate(self, html: str) -> str:
         """
@@ -219,6 +220,7 @@ def translate_node(self, doc: BeautifulSoup) -> None:
         )
         self.translatables = dict(zip(sentences, translated_sentences))
         # Now apply the translation on same json object
+        self.match_location_cache = {}  # Clear the cache
         self.traverse(doc, mode="apply")
 
     def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
@@ -272,7 +274,11 @@ def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
                     node_html = str(node)
 
                     # Locate node_text in doc_inner_content
-                    # print("\t", doc.name, ">", node.name, ">", node_text)
+                    if node_text in self.match_location_cache:
+                        # This node_text was located before, so this is a repeated occurrence of
+                        # the same text somewhere else in the document.
+                        # Start the search after the previous location.
+                        search_start = self.match_location_cache.get(node_text)
                     (match, translation_start, translation_end) = fuzzy_find(
                         doc_inner_content, node_text, search_start=search_start
                     )
@@ -292,7 +298,12 @@ def traverse(self, doc: BeautifulSoup, mode="extract") -> None:
                             doc_inner_content[translation_end:],
                         ]
                     )
-                    search_start = translation_start + len(node_html)
+
+                    # Remember the location where this node_text was found.
+                    # If this node_text appears again in the document, don't apply html wrapping
+                    # again, since that would result in invalid content like:
+                    # `<b><b>abc</b></b> another abc`
+                    self.match_location_cache[node_text] = translation_start + len(node_html)
 
             doc.clear()
             doc.insert(