drivers/tty/vt/gen_ucs_width_table.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_width_table.h

import unicodedata
import sys
import argparse

# This script's file name (basename only, no directory part). It is embedded
# in the "Auto-generated by" line of the generated header.
from pathlib import Path
this_file = Path(__file__).name

# Default output file name, used when -o/--output is not given on the
# command line and as the default for write_tables().
DEFAULT_OUT_FILE = "ucs_width_table.h"

# --- Global Constants for Width Assignments ---

# Known zero-width characters
# Individual code points (not ranges) that are forced to width 0.
# create_width_tables() consults this list after its combining-mark and
# format-character (Cf) category checks, so it acts as an explicit safety
# net for code points those checks did not already catch.
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF   # ZERO WIDTH NO-BREAK SPACE (BOM)
)

# Zero-width emoji modifiers and components
# Each entry is an inclusive (start, end) code point range.
# NOTE: Some of these characters would normally be single-width according to
# East Asian Width properties, but we deliberately override them to be
# zero-width because they function as modifiers in emoji sequences.
# These ranges are applied first in create_width_tables(), so they take
# precedence over both the Unicode property lookups and EMOJI_RANGES.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),    # Variation Selectors 1-16

    # Gender and hair style modifiers
    # These would be single-width by Unicode properties, but are zero-width
    # when part of emoji
    (0x2640, 0x2640),    # Female sign
    (0x2642, 0x2642),    # Male sign
    (0x26A7, 0x26A7),    # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, bald, white)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]

# Regional indicators (flag components)
# A single inclusive (start, end) range. create_width_tables() marks every
# code point in it as single-width, since two of them are normally paired
# to form one flag.
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)  # Regional indicator symbols A-Z

# Double-width emoji ranges
#
# Each entry is an inclusive (start, end) code point range. Code points in
# these ranges are forced to double-width by create_width_tables(), except
# for those already marked zero-width (e.g. the skin tone modifiers).
#
# Many emoji characters are classified as single-width according to Unicode
# Standard Annex #11 East Asian Width property (N or Neutral), but we
# deliberately override them to be double-width. References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in W3C CSS Text specification
#    (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]

def create_width_tables():
    """
    Creates Unicode character width tables and returns the data structures.

    Every code point from U+0000 to U+10FFFF is classified as zero-, single-
    or double-width, with the following order of precedence:

    1. EMOJI_ZERO_WIDTH ranges are forced to width 0.
    2. REGIONAL_INDICATORS are forced to width 1.
    3. Combining marks (category M*), format characters (category Cf) and
       KNOWN_ZERO_WIDTH code points get width 0.
    4. East Asian Width F or W gives width 2; anything else gives width 1.
    5. EMOJI_RANGES are finally raised to width 2, except for code points
       that are already zero-width.

    The result depends on the Unicode database bundled with the running
    Python interpreter (see unicodedata.unidata_version).

    Returns:
        tuple: (zero_width_ranges, double_width_ranges), each a sorted list
        of inclusive (start, end) code point ranges. Single-width code
        points are the implicit default and are not returned.
    """

    # Width data mapping
    width_map = {}  # Maps code points to width (0, 1, 2)

    # Mark emoji modifiers as zero-width
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Process the full Unicode range (0x0 to 0x10FFFF), assigned or not.
    # chr() and the unicodedata lookups below are defined for every code
    # point in this range, so no exception handling is needed.
    for cp in range(0x110000):
        # Skip if already processed (explicit overrides above win)
        if cp in width_map:
            continue

        char = chr(cp)
        category = unicodedata.category(char)

        # Zero-width: combining marks, format characters and the explicit
        # list of known zero-width code points.
        #
        # Since we have no support for bidirectional text, all format
        # characters (category Cf) can be treated with width 0 (zero)
        # for simplicity, as they don't need to occupy visual space
        # in a non-bidirectional text environment.
        if (category.startswith('M') or category == 'Cf'
                or cp in KNOWN_ZERO_WIDTH):
            width_map[cp] = 0
        # Use East Asian Width property: Fullwidth or Wide is double-width
        elif unicodedata.east_asian_width(char) in ('F', 'W'):
            width_map[cp] = 2
        # Narrow, Halfwidth, Neutral, Ambiguous and anything unknown
        # default to single-width
        else:
            width_map[cp] = 1

    # Process Emoji - generally double-width
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            # Don't override zero-width
            if width_map.get(cp) != 0:
                width_map[cp] = 2

    # Optimize to create range tables
    def ranges_optimize(width_data, target_width):
        """Collapse all code points of target_width into inclusive ranges."""
        points = sorted(cp for cp, width in width_data.items()
                        if width == target_width)
        if not points:
            return []

        # Group consecutive code points into ranges
        ranges = []
        start = prev = points[0]

        for cp in points[1:]:
            if cp > prev + 1:
                # Gap found: close the current range and open a new one
                ranges.append((start, prev))
                start = cp
            prev = cp

        # Add the last range
        ranges.append((start, prev))
        return ranges

    # Extract ranges for each width
    zero_width_ranges = ranges_optimize(width_map, 0)
    double_width_ranges = ranges_optimize(width_map, 2)

    return zero_width_ranges, double_width_ranges

def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Each input list is split at the BMP boundary and emitted as two C
    arrays: a `struct ucs_interval16` array for U+0000 to U+FFFF and a
    `struct ucs_interval32` array for U+10000 and above. Every entry is
    followed by a C comment describing its first and last code point.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE)
    """

    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
    def split_ranges_by_size(ranges):
        bmp_ranges = []
        non_bmp_ranges = []

        for start, end in ranges:
            if end <= 0xFFFF:
                bmp_ranges.append((start, end))
            elif start > 0xFFFF:
                non_bmp_ranges.append((start, end))
            else:
                # Split the range at 0xFFFF
                bmp_ranges.append((start, 0xFFFF))
                non_bmp_ranges.append((0x10000, end))

        return bmp_ranges, non_bmp_ranges

    # Function to generate code point description comments
    def get_code_point_comment(start, end):
        try:
            start_char_desc = unicodedata.name(chr(start))
            if start == end:
                return f"/* {start_char_desc} */"
            end_char_desc = unicodedata.name(chr(end))
            return f"/* {start_char_desc} - {end_char_desc} */"
        except ValueError:
            # unicodedata.name() raises ValueError for code points without
            # a name; fall back to U+XXXX notation for the whole comment.
            if start == end:
                return f"/* U+{start:04X} */"
            return f"/* U+{start:04X} - U+{end:04X} */"

    # Function to emit one table's entries; `digits` is the minimum number
    # of hex digits per code point (4 for BMP, 5 for non-BMP tables)
    def write_entries(f, ranges, digits):
        for start, end in ranges:
            comment = get_code_point_comment(start, end)
            f.write(f"\t{{ 0x{start:0{digits}X}, 0x{end:0{digits}X} }}, {comment}\n")

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)

    # Generate C tables. The encoding is explicit so the output does not
    # depend on the locale of the machine running the script.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */

/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct ucs_interval16 ucs_zero_width_bmp_ranges[] = {{
""")

        write_entries(f, zero_width_bmp, 4)

        f.write("""\
};

/* Zero-width character ranges (non-BMP, U+10000 and above) */
static const struct ucs_interval32 ucs_zero_width_non_bmp_ranges[] = {
""")

        write_entries(f, zero_width_non_bmp, 5)

        f.write("""\
};

/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct ucs_interval16 ucs_double_width_bmp_ranges[] = {
""")

        write_entries(f, double_width_bmp, 4)

        f.write("""\
};

/* Double-width character ranges (non-BMP, U+10000 and above) */
static const struct ucs_interval32 ucs_double_width_non_bmp_ranges[] = {
""")

        write_entries(f, double_width_non_bmp, 5)

        f.write("};\n")

def main():
    """
    Command line entry point.

    Parses the -o/--output option, generates the width tables, writes them
    to the requested header file and prints a short summary to stdout.
    Kept in a function so that running the script does not create module
    globals that shadow the parameter names of write_tables().
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    # Write tables to header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Print summary (ranges are inclusive, hence the +1 per range)
    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
    print(f"Generated {args.output_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")


if __name__ == "__main__":
    main()