1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
|
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
# python3 gen_ucs_fallback_table.py # Generate fallback tables
# python3 gen_ucs_fallback_table.py -o FILE # Specify output file
import unicodedata
from unidecode import unidecode
import sys
import argparse
from collections import defaultdict
# Try to get the installed unidecode version; it is only reported in the
# generated header, so any failure simply degrades to 'unknown'.
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    # Covers a missing importlib.metadata (Python < 3.8) and a missing
    # package record, without swallowing KeyboardInterrupt/SystemExit the
    # way a bare "except:" would.
    unidecode_version = 'unknown'
# Base name of this script; written into the generated header's
# "Auto-generated by" line.
from pathlib import Path
this_file = Path(__file__).name
# Output file name used when -o/--output is not given
DEFAULT_OUT_FILE = "ucs_fallback_table.h"
# Fallback value that flags a table entry as the start of a compressed range:
# compress_ranges() emits (start_offset, RANGE_MARKER) followed by
# (end_offset, fallback). Also exported to the header as
# UCS_PAGE_ENTRY_RANGE_MARKER.
RANGE_MARKER = 0x00
def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points.

    Every named character from U+0080 to U+FFFF is transliterated with
    unidecode and kept only when the result is exactly one character long.
    The manual table from get_special_overrides() is applied last, so it can
    replace or add entries, or set a value of 0 to mark a code point for
    exclusion.

    Returns:
        dict mapping an int code point to the int ordinal of its fallback.
    """
    fallback_map = {}

    # Process BMP characters only to keep the table size manageable; the
    # ASCII range (0x00-0x7F) is skipped.
    for cp in range(0x0080, 0x10000):
        char = chr(cp)

        # Skip unassigned/control characters. Because a default is passed,
        # unicodedata.name() returns '' for them instead of raising
        # ValueError, so no exception handling is needed here.
        if not unicodedata.name(char, ''):
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map
def get_special_overrides():
    """Return the hand-picked overrides applied on top of unidecode's output.

    The result maps a code point to the ordinal of its single-character
    fallback. It covers characters for which unidecode yields several
    characters, an empty string, or a less meaningful replacement. A value
    of 0 marks a code point that must be left out of the table; such entries
    are filtered out in organize_by_pages().
    """
    # Each rule is (replacement character, code points). Rules are listed in
    # the order in which their entries are inserted into the result.
    rules = (
        # Multi-character unidecode output: in a terminal fallback context a
        # single character is needed rather than several.
        ('E', (0x00C6,)),  # Æ LATIN CAPITAL LETTER AE (unidecode: "AE")
        ('e', (0x00E6,)),  # æ LATIN SMALL LETTER AE (unidecode: "ae")
        ('E', (0x0152,)),  # Œ LATIN CAPITAL LIGATURE OE (unidecode: "OE")
        ('e', (0x0153,)),  # œ LATIN SMALL LIGATURE OE (unidecode: "oe")
        ('s', (0x00DF,)),  # ß LATIN SMALL LETTER SHARP S (unidecode: "ss")

        # Comparison operators that unidecode renders as two characters
        ('<', (0x2264,)),  # ≤ LESS-THAN OR EQUAL TO (unidecode: "<=")
        ('>', (0x2265,)),  # ≥ GREATER-THAN OR EQUAL TO (unidecode: ">=")

        # Characters for which unidecode returns an empty string
        ('#', (0x2260,)),  # ≠ NOT EQUAL TO
        ('#', range(0x2596, 0x259F + 1)),  # ▖ ▗ ▘ ▙ etc. quadrant blocks

        # Directional arrows: more meaningful than unidecode's "-" and "|"
        ('>', (0x2192,)),  # → RIGHTWARDS ARROW (unidecode: "-")
        ('<', (0x2190,)),  # ← LEFTWARDS ARROW (unidecode: "-")
        ('^', (0x2191,)),  # ↑ UPWARDS ARROW (unidecode: "|")
        ('v', (0x2193,)),  # ↓ DOWNWARDS ARROW (unidecode: "|")

        # Double arrows, same directional mapping
        ('<', (0x21D0,)),  # ⇐ LEFTWARDS DOUBLE ARROW
        ('^', (0x21D1,)),  # ⇑ UPWARDS DOUBLE ARROW
        ('>', (0x21D2,)),  # ⇒ RIGHTWARDS DOUBLE ARROW
        ('v', (0x21D3,)),  # ⇓ DOWNWARDS DOUBLE ARROW

        # Halfwidth arrows, treated like their normal-width counterparts
        ('<', (0xFFE9,)),  # HALFWIDTH LEFTWARDS ARROW (unidecode: "-")
        ('^', (0xFFEA,)),  # HALFWIDTH UPWARDS ARROW (unidecode: "|")
        ('>', (0xFFEB,)),  # HALFWIDTH RIGHTWARDS ARROW (unidecode: "-")
        ('v', (0xFFEC,)),  # HALFWIDTH DOWNWARDS ARROW (unidecode: "|")

        # Currency symbols, each mapped to a representative letter
        ('c', (0x00A2,)),  # ¢ CENT SIGN
        ('L', (0x00A3,)),  # £ POUND SIGN
        ('Y', (0x00A5,)),  # ¥ YEN SIGN
        ('E', (0x20AC,)),  # € EURO SIGN

        # Symbols mapped to letters
        ('S', (0x00A7,)),  # § SECTION SIGN
        ('C', (0x00A9,)),  # © COPYRIGHT SIGN
        ('R', (0x00AE,)),  # ® REGISTERED SIGN
        ('T', (0x2122,)),  # ™ TRADE MARK SIGN

        # Degree-related symbols
        ('o', (0x00B0,)),  # ° DEGREE SIGN
        ('C', (0x2103,)),  # ℃ DEGREE CELSIUS
        ('F', (0x2109,)),  # ℉ DEGREE FAHRENHEIT

        # Angle quotation marks
        ('<', (0x00AB,)),  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        ('>', (0x00BB,)),  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK

        # Operators with circular shape
        ('o', (0x2218,)),  # ∘ RING OPERATOR
        ('.', (0x2219,)),  # ∙ BULLET OPERATOR

        # Negated mathematical symbols -> '!' (semantically "not")
        ('!', (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F,
               0x2280, 0x2281, 0x2284, 0x2285)),
        # Negated equality symbols -> '#' (semantically "not equal")
        ('#', (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D,
               0x228A, 0x228B)),
        # Negated arrows -> '!'
        ('!', (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF)),

        # Dashes and hyphens
        ('-', (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043,
               0x2052)),
        # Question mark punctuation
        ('?', (0x203D, 0x2047, 0x2048)),
        # Exclamation mark punctuation
        ('!', (0x203C, 0x2049)),
        # Asterisk-like symbols
        ('*', (0x2042, 0x2051, 0x2055)),

        # Other specific punctuation with unique mappings
        ('"', (0x201E,)),  # „ DOUBLE LOW-9 QUOTATION MARK
        ('>', (0x2023,)),  # ‣ TRIANGULAR BULLET
        ('.', (0x2026,)),  # … HORIZONTAL ELLIPSIS
        ('"', (0x2033,)),  # ″ DOUBLE PRIME
        ('P', (0x204B,)),  # ⁋ REVERSED PILCROW SIGN
        ('<', (0x204C,)),  # ⁌ BLACK LEFTWARDS BULLET
        ('>', (0x204D,)),  # ⁍ BLACK RIGHTWARDS BULLET
        (';', (0x204F,)),  # ⁏ REVERSED SEMICOLON
        (':', (0x205B,)),  # ⁛ FOUR DOT MARK

        # Check marks
        ('v', (0x2713,)),  # ✓ CHECK MARK
        ('V', (0x2714,)),  # ✔ HEAVY CHECK MARK

        # X marks: lowercase for regular, uppercase for heavy
        ('x', (0x2715, 0x2717)),
        ('X', (0x2716, 0x2718)),

        # Stars and asterisk-like symbols -> '*'
        ('*', (0x2605, 0x2606, 0x262A, 0x269D, 0x2698)),
        ('*', range(0x2721, 0x2746 + 1)),
        ('*', range(0x2749, 0x274B + 1)),
        # Star operators -> '*'
        ('*', (0x229B, 0x22C6, 0x235F, 0x2363)),
    )

    overrides = {}
    for replacement, codepoints in rules:
        overrides.update(dict.fromkeys(codepoints, ord(replacement)))

    # Special exclusions with a fallback value of 0; these are filtered out
    # in organize_by_pages().
    # U+2028 LINE SEPARATOR (unidecode: '\n')
    overrides[0x2028] = 0

    # Full-width forms 0xFF01 (!) to 0xFF5E (~) correspond to ASCII 33 (!) to
    # 126 (~). They are excluded here to reduce the table size: it is more
    # efficient to process them programmatically in
    # ucs.c:ucs_get_fallback().
    overrides.update(dict.fromkeys(range(0xFF01, 0xFF5E + 1), 0))

    return overrides
def organize_by_pages(fallback_map):
    """Bucket the fallback mappings by page, i.e. the code point's high byte.

    Entries whose fallback is 0 are treated as excluded and dropped.

    Returns:
        defaultdict mapping each page number to a list of
        (offset, fallback) tuples sorted by offset, where offset is the
        code point's low byte.
    """
    pages = defaultdict(list)

    for codepoint, replacement in fallback_map.items():
        if replacement != 0:
            # Split into high byte (page) and low byte (offset within page)
            page, offset = divmod(codepoint, 0x100)
            pages[page].append((offset, replacement))

    # Order every page's entries by offset
    for bucket in pages.values():
        bucket.sort()

    return pages
def compress_ranges(page_groups):
    """Collapse runs of consecutive offsets sharing one fallback into ranges.

    A run is collapsed only when it holds 3 or more entries. It is then
    replaced by two entries: (first_offset, RANGE_MARKER) followed by
    (last_offset, fallback). Shorter runs are copied through unchanged.

    Args:
        page_groups: mapping of page -> list of (offset, fallback) tuples,
            sorted by offset.

    Returns:
        dict with the same pages, each holding its compressed entry list.
    """
    result = {}

    for page, entries in page_groups.items():
        packed = []
        total = len(entries)
        run_start = 0

        while run_start < total:
            first_offset, value = entries[run_start]

            # Extend the run while offsets stay consecutive and the
            # fallback stays the same.
            run_end = run_start + 1
            while run_end < total:
                offset, candidate = entries[run_end]
                if offset != entries[run_end - 1][0] + 1 or candidate != value:
                    break
                run_end += 1

            run = entries[run_start:run_end]
            if len(run) >= 3:
                # Worth compressing: range start marker, then range end
                packed.append((first_offset, RANGE_MARKER))
                packed.append((run[-1][0], value))
            else:
                # Too short: keep the individual entries as they are
                packed.extend(run)

            run_start = run_end

        result[page] = packed

    return result
def cp_name(cp):
    """Get the Unicode character name for a code point.

    Args:
        cp: integer code point.

    Returns:
        The character's Unicode name, or "U+XXXX" (at least four uppercase
        hex digits) when the code point has no name or is outside the range
        chr() accepts.
    """
    try:
        return unicodedata.name(chr(cp))
    except (ValueError, OverflowError):
        # ValueError: unnamed character, or cp outside chr()'s valid range.
        # OverflowError: cp too large for chr() to convert at all.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return f"U+{cp:04X}"
def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables and write them as a C header.

    Runs the pipeline generate_fallback_map() -> organize_by_pages() ->
    compress_ranges(), prints a short summary of each stage to stdout, and
    writes two C arrays plus the range-marker #define to out_file.

    Args:
        out_file: path of the header to create or overwrite. The value is
            also echoed verbatim in the header's leading comment.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # (page, entries) pairs in ascending page order; both arrays below are
    # written in this order so the descriptors' start indexes line up with
    # the entries array.
    sorted_pages = sorted(compressed_pages.items())

    # Explicit encoding: out_file is echoed into the header comment, so a
    # non-ASCII name must not depend on the locale's default encoding.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
* {out_file} - Unicode character fallback table
*
* Auto-generated by {this_file}
*
* Unicode Version: {unicodedata.unidata_version}
* Unidecode Version: {unidecode_version}
*
* This file contains optimized tables that map complex Unicode characters
* to simpler fallback characters for terminal display when corresponding
* glyphs are unavailable.
*/
static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Write page descriptors: { page, entry count, index of the page's
        # first entry in the entries array }
        start_index = 0
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};
/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            f.write(f"\t/* Entries for page 0x{page:02X} */\n")

            for offset, fallback in entries:
                # The comment names the code point; a range-start entry has
                # no fallback character of its own to show.
                codepoint = (page << 8) | offset
                if fallback == RANGE_MARKER:
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"

                # Hex for better readability
                f.write(f"\t{{ 0x{offset:02X}, 0x{fallback:02X} }}, /* {comment} */\n")

        f.write(f"""\
}};
#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")
if __name__ == "__main__":
    # Command-line entry point: the only option selects the output file.
    cli = argparse.ArgumentParser(
        description="Generate Unicode fallback character tables")
    cli.add_argument(
        "-o", "--output",
        dest="output_file",
        default=DEFAULT_OUT_FILE,
        help=f"Output file name (default: {DEFAULT_OUT_FILE})",
    )
    options = cli.parse_args()

    generate_fallback_tables(out_file=options.output_file)
|