1 files changed, 360 insertions, 0 deletions
diff --git a/drivers/tty/vt/gen_ucs_fallback_table.py b/drivers/tty/vt/gen_ucs_fallback_table.py
new file mode 100755
index 000000000000..6e09c1cb6d4b
--- /dev/null
+++ b/drivers/tty/vt/gen_ucs_fallback_table.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Leverage Python's unidecode module to generate ucs_fallback_table.h
+#
+# The generated table maps complex characters to their simpler fallback forms
+# for a terminal display when corresponding glyphs are unavailable.
+#
+# Usage:
+#   python3 gen_ucs_fallback_table.py         # Generate fallback tables
+#   python3 gen_ucs_fallback_table.py -o FILE # Specify output file
+
+import unicodedata
+from unidecode import unidecode
+import sys
+import argparse
+from collections import defaultdict
+
+# Try to get unidecode version
+try:
+    from importlib.metadata import version
+    unidecode_version = version('unidecode')
+except:
+    unidecode_version = 'unknown'
+
+# This script's file name
+from pathlib import Path
+this_file = Path(__file__).name
+
+# Default output file name
+DEFAULT_OUT_FILE = "ucs_fallback_table.h"
+
+# Define the range marker value
+RANGE_MARKER = 0x00
+
+def generate_fallback_map():
+    """Generate a fallback map using unidecode for all relevant Unicode points."""
+    fallback_map = {}
+
+    # Process BMP characters (0x0000 - 0xFFFF) to keep table size manageable
+    for cp in range(0x0080, 0x10000):  # Skip ASCII range (0x00-0x7F)
+        char = chr(cp)
+
+        # Skip unassigned/control characters
+        try:
+            if not unicodedata.name(char, ''):
+                continue
+        except ValueError:
+            continue
+
+        # Get the unidecode transliteration
+        ascii_version = unidecode(char)
+
+        # Only store if it results in a single character mapping
+        if len(ascii_version) == 1:
+            fallback_map[cp] = ord(ascii_version)
+
+    # Apply manual overrides for special cases
+    fallback_map.update(get_special_overrides())
+
+    return fallback_map
+
+def get_special_overrides():
+    """Get special case overrides that need different handling than unidecode
+    provides... or doesn't provide at all."""
+
+    overrides = {}
+
+    # Multi-character unidecode output
+    # These map to single chars instead of unidecode's multiple-char mappings
+    # In a terminal fallback context, we need a single character rather than multiple
+    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
+    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
+    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
+    overrides[0x0153] = ord('e')  # œ LATIN SMALL LETTER LIGATURE OE -> e (unidecode: "oe")
+    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")
+
+    # Comparison operators that unidecode renders as multiple characters
+    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
+    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")
+
+    # Unidecode returns an empty string for these
+    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)
+
+    # Quadrant block characters that unidecode doesn't map
+    for cp in range(0x2596, 0x259F+1):
+        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)
+
+    # Directional arrows
+    # These provide better semantic meaning than unidecode's mappings
+    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
+    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
+    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
+    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")
+
+    # Double arrows with their directional semantic mappings
+    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
+    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
+    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
+    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v
+
+    # Halfwidth arrows
+    # These need the same treatment as their normal-width counterparts
+    overrides[0xFFE9] = ord('<')  # ￩ HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
+    overrides[0xFFEA] = ord('^')  # ￪ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
+    overrides[0xFFEB] = ord('>')  # ￫ HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
+    overrides[0xFFEC] = ord('v')  # ￬ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")
+
+    # Currency symbols - each mapped to a representative letter
+    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
+    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
+    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
+    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E
+
+    # Symbols mapped to letters
+    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
+    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
+    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
+    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T
+
+    # Degree-related symbols
+    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
+    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
+    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F
+
+    # Angle quotation marks
+    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
+    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >
+
+    # Operators with circular shape
+    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
+    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .
+
+    # Negated mathematical symbols (preserving the negation semantics)
+    # Negated symbols mapped to exclamation mark (semantically "not")
+    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
+        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)
+
+    # Negated symbols mapped to hash sign (semantically "not equal")
+    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
+        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)
+
+    # Negated arrows - all mapped to exclamation mark
+    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
+        overrides[cp] = ord('!')  # Negated arrows -> ! (not)
+
+    # Dashes and hyphens
+    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
+        overrides[cp] = ord('-')  # Dashes and hyphens -> -
+
+    # Question mark punctuation
+    for cp in (0x203D, 0x2047, 0x2048):
+        overrides[cp] = ord('?')  # Question marks -> ?
+
+    # Exclamation mark punctuation
+    for cp in (0x203C, 0x2049):
+        overrides[cp] = ord('!')  # Exclamation marks -> !
+
+    # Asterisk-like symbols
+    for cp in (0x2042, 0x2051, 0x2055):
+        overrides[cp] = ord('*')
+
+    # Other specific punctuation with unique mappings
+    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
+    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
+    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
+    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
+    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
+    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
+    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
+    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
+    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK
+
+    # Check marks
+    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
+    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK
+
+    # X marks - lowercase for regular, uppercase for heavy
+    for cp in (0x2715, 0x2717):
+        overrides[cp] = ord('x')  # Regular X marks -> x
+    for cp in (0x2716, 0x2718):
+        overrides[cp] = ord('X')  # Heavy X marks -> X
+
+    # Stars and asterisk-like symbols mapped to '*'
+    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
+        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
+    for cp in range(0x2721, 0x2746+1):
+        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
+    for cp in range(0x2749, 0x274B+1):
+        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
+    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
+        overrides[cp] = ord('*')  # Star operators -> *
+
+    # Special exclusions with fallback value of 0
+    # These will be filtered out in organize_by_pages()
+
+    # Exclude U+2028 (LINE SEPARATOR)
+    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')
+
+    # Full-width to ASCII mapping (covering all printable ASCII 33-126)
+    # 0xFF01 (！) to 0xFF5E (～) -> ASCII 33 (!) to 126 (~)
+    # Those are excluded here to reduce the table size.
+    # It is more efficient to process them programmatically in
+    # ucs.c:ucs_get_fallback().
+    for cp in range(0xFF01, 0xFF5E + 1):
+        overrides[cp] = 0  # Double-width ASCII characters
+
+    return overrides
+
+def organize_by_pages(fallback_map):
+    """Organize the fallback mappings by their high byte (page)."""
+    # Group by high byte (page)
+    page_groups = defaultdict(list)
+    for code, fallback in fallback_map.items():
+        # Skip characters with fallback value of 0 (excluded characters)
+        if fallback == 0:
+            continue
+
+        page = code >> 8  # Get the high byte (page)
+        offset = code & 0xFF  # Get the low byte (offset within page)
+        page_groups[page].append((offset, fallback))
+
+    # Sort each page's entries by offset
+    for page in page_groups:
+        page_groups[page].sort()
+
+    return page_groups
+
+def compress_ranges(page_groups):
+    """Compress consecutive entries with the same fallback character into ranges.
+    A range is only compressed if it contains 3 or more consecutive entries."""
+
+    compressed_pages = {}
+
+    for page, entries in page_groups.items():
+        compressed_entries = []
+        i = 0
+        while i < len(entries):
+            start_offset, fallback = entries[i]
+
+            # Look ahead to find consecutive entries with the same fallback
+            j = i + 1
+            while (j < len(entries) and
+                   entries[j][0] == entries[j-1][0] + 1 and  # consecutive offsets
+                   entries[j][1] == fallback):               # same fallback
+                j += 1
+
+            # Calculate the range end
+            end_offset = entries[j-1][0]
+
+            # If we found a range with 3 or more entries (worth compressing)
+            if j - i >= 3:
+                # Add a range entry
+                compressed_entries.append((start_offset, RANGE_MARKER))
+                compressed_entries.append((end_offset, fallback))
+            else:
+                # Add the individual entries as is
+                for k in range(i, j):
+                    compressed_entries.append(entries[k])
+
+            i = j
+
+        compressed_pages[page] = compressed_entries
+
+    return compressed_pages
+
+def cp_name(cp):
+    """Get the Unicode character name for a code point."""
+    try:
+        return unicodedata.name(chr(cp))
+    except:
+        return f"U+{cp:04X}"
+
+def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
+    """Generate the fallback character tables."""
+    # Generate fallback map using unidecode
+    fallback_map = generate_fallback_map()
+    print(f"Generated {len(fallback_map)} total fallback mappings")
+
+    # Organize by pages
+    page_groups = organize_by_pages(fallback_map)
+    print(f"Organized into {len(page_groups)} pages")
+
+    # Compress ranges
+    compressed_pages = compress_ranges(page_groups)
+    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
+    print(f"Total compressed entries: {total_compressed_entries}")
+
+    # Create output file
+    with open(out_file, 'w') as f:
+        f.write(f"""\
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * {out_file} - Unicode character fallback table
+ *
+ * Auto-generated by {this_file}
+ *
+ * Unicode Version: {unicodedata.unidata_version}
+ * Unidecode Version: {unidecode_version}
+ *
+ * This file contains optimized tables that map complex Unicode characters
+ * to simpler fallback characters for terminal display when corresponding
+ * glyphs are unavailable.
+ */
+
+static const struct ucs_page_desc ucs_fallback_pages[] = {{
+""")
+
+        # Convert compressed_pages to a sorted list of (page, entries) tuples
+        sorted_pages = sorted(compressed_pages.items())
+
+        # Track the start index for each page
+        start_index = 0
+
+        # Write page descriptors
+        for page, entries in sorted_pages:
+            count = len(entries)
+            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
+            start_index += count
+
+        # Write entries array
+        f.write("""\
+};
+
+/* Page entries array (referenced by page descriptors) */
+static const struct ucs_page_entry ucs_fallback_entries[] = {
+""")
+
+        # Write all entries
+        for page, entries in sorted_pages:
+            page_hex = f"0x{page:02X}"
+            f.write(f"\t/* Entries for page {page_hex} */\n")
+
+            for i, (offset, fallback) in enumerate(entries):
+                # Convert to hex for better readability
+                offset_hex = f"0x{offset:02X}"
+                fallback_hex = f"0x{fallback:02X}"
+
+                # Handle comments
+                codepoint = (page << 8) | offset
+
+                if fallback == RANGE_MARKER:
+                    comment = f"{cp_name(codepoint)} -> ..."
+                else:
+                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
+                f.write(f"\t{{ 0x{offset:02X}, 0x{fallback:02X} }}, /* {comment} */\n")
+
+        f.write(f"""\
+}};
+
+#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
+""")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
+    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
+                       help=f"Output file name (default: {DEFAULT_OUT_FILE})")
+    args = parser.parse_args()
+
+    generate_fallback_tables(out_file=args.output_file)