#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 # # Leverage Python's unicodedata module to generate ucs_width_table.h import unicodedata import sys import argparse # This script's file name from pathlib import Path this_file = Path(__file__).name # Default output file name DEFAULT_OUT_FILE = "ucs_width_table.h" # --- Global Constants for Width Assignments --- # Known zero-width characters KNOWN_ZERO_WIDTH = ( 0x200B, # ZERO WIDTH SPACE 0x200C, # ZERO WIDTH NON-JOINER 0x200D, # ZERO WIDTH JOINER 0x2060, # WORD JOINER 0xFEFF # ZERO WIDTH NO-BREAK SPACE (BOM) ) # Zero-width emoji modifiers and components # NOTE: Some of these characters would normally be single-width according to # East Asian Width properties, but we deliberately override them to be # zero-width because they function as modifiers in emoji sequences. EMOJI_ZERO_WIDTH = [ # Skin tone modifiers (0x1F3FB, 0x1F3FF), # Emoji modifiers (skin tones) # Variation selectors (note: VS16 is treated specially in vt.c) (0xFE00, 0xFE0F), # Variation Selectors 1-16 # Gender and hair style modifiers # These would be single-width by Unicode properties, but are zero-width # when part of emoji (0x2640, 0x2640), # Female sign (0x2642, 0x2642), # Male sign (0x26A7, 0x26A7), # Transgender symbol (0x1F9B0, 0x1F9B3), # Hair components (red, curly, white, bald) # Tag characters (0xE0020, 0xE007E), # Tags ] # Regional indicators (flag components) REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF) # Regional indicator symbols A-Z # Double-width emoji ranges # # Many emoji characters are classified as single-width according to Unicode # Standard Annex #11 East Asian Width property (N or Neutral), but we # deliberately override them to be double-width. References: # 1. Unicode Technical Standard #51: Unicode Emoji # (https://www.unicode.org/reports/tr51/) # 2. Principle of "emoji presentation" in WHATWG CSS Text specification # (https://drafts.csswg.org/css-text-3/#character-properties) # 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which # universally render emoji as double-width characters regardless of their # Unicode EAW property # 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1 # Emoji width (https://www.w3.org/TR/jlreq/) EMOJI_RANGES = [ (0x1F000, 0x1F02F), # Mahjong Tiles (EAW: N, but displayed as double-width) (0x1F0A0, 0x1F0FF), # Playing Cards (EAW: N, but displayed as double-width) (0x1F300, 0x1F5FF), # Miscellaneous Symbols and Pictographs (0x1F600, 0x1F64F), # Emoticons (0x1F680, 0x1F6FF), # Transport and Map Symbols (0x1F700, 0x1F77F), # Alchemical Symbols (0x1F780, 0x1F7FF), # Geometric Shapes Extended (0x1F800, 0x1F8FF), # Supplemental Arrows-C (0x1F900, 0x1F9FF), # Supplemental Symbols and Pictographs (0x1FA00, 0x1FA6F), # Chess Symbols (0x1FA70, 0x1FAFF), # Symbols and Pictographs Extended-A ] def create_width_tables(): """ Creates Unicode character width tables and returns the data structures. Returns: tuple: (zero_width_ranges, double_width_ranges) """ # Width data mapping width_map = {} # Maps code points to width (0, 1, 2) # Mark emoji modifiers as zero-width for start, end in EMOJI_ZERO_WIDTH: for cp in range(start, end + 1): width_map[cp] = 0 # Mark all regional indicators as single-width as they are usually paired # providing a combined width of 2 when displayed together. start, end = REGIONAL_INDICATORS for cp in range(start, end + 1): width_map[cp] = 1 # Process all assigned Unicode code points (Basic Multilingual Plane + # Supplementary Planes) Range 0x0 to 0x10FFFF (the full Unicode range) for block_start in range(0, 0x110000, 0x1000): block_end = block_start + 0x1000 for cp in range(block_start, block_end): try: char = chr(cp) # Skip if already processed if cp in width_map: continue # Check for combining marks and a format characters category = unicodedata.category(char) # Combining marks if category.startswith('M'): width_map[cp] = 0 continue # Format characters # Since we have no support for bidirectional text, all format # characters (category Cf) can be treated with width 0 (zero) # for simplicity, as they don't need to occupy visual space # in a non-bidirectional text environment. if category == 'Cf': width_map[cp] = 0 continue # Known zero-width characters if cp in KNOWN_ZERO_WIDTH: width_map[cp] = 0 continue # Use East Asian Width property eaw = unicodedata.east_asian_width(char) if eaw in ('F', 'W'): # Fullwidth or Wide width_map[cp] = 2 elif eaw in ('Na', 'H', 'N', 'A'): # Narrow, Halfwidth, Neutral, Ambiguous width_map[cp] = 1 else: # Default to single-width for unknown width_map[cp] = 1 except (ValueError, OverflowError): # Skip invalid code points continue # Process Emoji - generally double-width for start, end in EMOJI_RANGES: for cp in range(start, end + 1): if cp not in width_map or width_map[cp] != 0: # Don't override zero-width try: char = chr(cp) width_map[cp] = 2 except (ValueError, OverflowError): continue # Optimize to create range tables def ranges_optimize(width_data, target_width): points = sorted([cp for cp, width in width_data.items() if width == target_width]) if not points: return [] # Group consecutive code points into ranges ranges = [] start = points[0] prev = start for cp in points[1:]: if cp > prev + 1: ranges.append((start, prev)) start = cp prev = cp # Add the last range ranges.append((start, prev)) return ranges # Extract ranges for each width zero_width_ranges = ranges_optimize(width_map, 0) double_width_ranges = ranges_optimize(width_map, 2) return zero_width_ranges, double_width_ranges def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE): """ Write the generated tables to C header file. Args: zero_width_ranges: List of (start, end) ranges for zero-width characters double_width_ranges: List of (start, end) ranges for double-width characters out_file: Output file name (default: DEFAULT_OUT_FILE) """ # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit) def split_ranges_by_size(ranges): bmp_ranges = [] non_bmp_ranges = [] for start, end in ranges: if end <= 0xFFFF: bmp_ranges.append((start, end)) elif start > 0xFFFF: non_bmp_ranges.append((start, end)) else: # Split the range at 0xFFFF bmp_ranges.append((start, 0xFFFF)) non_bmp_ranges.append((0x10000, end)) return bmp_ranges, non_bmp_ranges # Split ranges into BMP and non-BMP zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges) double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges) # Function to generate code point description comments def get_code_point_comment(start, end): try: start_char_desc = unicodedata.name(chr(start)) if start == end: return f"/* {start_char_desc} */" else: end_char_desc = unicodedata.name(chr(end)) return f"/* {start_char_desc} - {end_char_desc} */" except: if start == end: return f"/* U+{start:04X} */" else: return f"/* U+{start:04X} - U+{end:04X} */" # Generate C tables with open(out_file, 'w') as f: f.write(f"""\ /* SPDX-License-Identifier: GPL-2.0 */ /* * {out_file} - Unicode character width * * Auto-generated by {this_file} * * Unicode Version: {unicodedata.unidata_version} */ /* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */ static const struct ucs_interval16 ucs_zero_width_bmp_ranges[] = {{ """) for start, end in zero_width_bmp: comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n") f.write("""\ }; /* Zero-width character ranges (non-BMP, U+10000 and above) */ static const struct ucs_interval32 ucs_zero_width_non_bmp_ranges[] = { """) for start, end in zero_width_non_bmp: comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n") f.write("""\ }; /* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */ static const struct ucs_interval16 ucs_double_width_bmp_ranges[] = { """) for start, end in double_width_bmp: comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n") f.write("""\ }; /* Double-width character ranges (non-BMP, U+10000 and above) */ static const struct ucs_interval32 ucs_double_width_non_bmp_ranges[] = { """) for start, end in double_width_non_bmp: comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n") f.write("};\n") if __name__ == "__main__": # Parse command line arguments parser = argparse.ArgumentParser(description="Generate Unicode width tables") parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE, help=f"Output file name (default: {DEFAULT_OUT_FILE})") args = parser.parse_args() # Write tables to header file zero_width_ranges, double_width_ranges = create_width_tables() write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file) # Print summary zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges) double_width_count = sum(end - start + 1 for start, end in double_width_ranges) print(f"Generated {args.output_file} with:") print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points") print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points") print(f"- Unicode Version: {unicodedata.unidata_version}")