diff options
Diffstat (limited to 'drivers/tty/vt/ucs.c')
-rw-r--r-- | drivers/tty/vt/ucs.c | 251 |
1 files changed, 251 insertions, 0 deletions
diff --git a/drivers/tty/vt/ucs.c b/drivers/tty/vt/ucs.c new file mode 100644 index 000000000000..6ead622b7713 --- /dev/null +++ b/drivers/tty/vt/ucs.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucs.c - Universal Character Set processing + */ + +#include <linux/array_size.h> +#include <linux/bsearch.h> +#include <linux/consolemap.h> +#include <linux/minmax.h> + +struct ucs_interval16 { + u16 first; + u16 last; +}; + +struct ucs_interval32 { + u32 first; + u32 last; +}; + +#include "ucs_width_table.h" + +static int interval16_cmp(const void *key, const void *element) +{ + u16 cp = *(u16 *)key; + const struct ucs_interval16 *entry = element; + + if (cp < entry->first) + return -1; + if (cp > entry->last) + return 1; + return 0; +} + +static int interval32_cmp(const void *key, const void *element) +{ + u32 cp = *(u32 *)key; + const struct ucs_interval32 *entry = element; + + if (cp < entry->first) + return -1; + if (cp > entry->last) + return 1; + return 0; +} + +static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size) +{ + if (cp < ranges[0].first || cp > ranges[size - 1].last) + return false; + + return __inline_bsearch(&cp, ranges, size, sizeof(*ranges), + interval16_cmp) != NULL; +} + +static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size) +{ + if (cp < ranges[0].first || cp > ranges[size - 1].last) + return false; + + return __inline_bsearch(&cp, ranges, size, sizeof(*ranges), + interval32_cmp) != NULL; +} + +#define UCS_IS_BMP(cp) ((cp) <= 0xffff) + +/** + * ucs_is_zero_width() - Determine if a Unicode code point is zero-width. + * @cp: Unicode code point (UCS-4) + * + * Return: true if the character is zero-width, false otherwise + */ +bool ucs_is_zero_width(u32 cp) +{ + if (UCS_IS_BMP(cp)) + return cp_in_range16(cp, ucs_zero_width_bmp_ranges, + ARRAY_SIZE(ucs_zero_width_bmp_ranges)); + else + return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges, + ARRAY_SIZE(ucs_zero_width_non_bmp_ranges)); +} + +/** + * ucs_is_double_width() - Determine if a Unicode code point is double-width. + * @cp: Unicode code point (UCS-4) + * + * Return: true if the character is double-width, false otherwise + */ +bool ucs_is_double_width(u32 cp) +{ + if (UCS_IS_BMP(cp)) + return cp_in_range16(cp, ucs_double_width_bmp_ranges, + ARRAY_SIZE(ucs_double_width_bmp_ranges)); + else + return cp_in_range32(cp, ucs_double_width_non_bmp_ranges, + ARRAY_SIZE(ucs_double_width_non_bmp_ranges)); +} + +/* + * Structure for base with combining mark pairs and resulting recompositions. + * Using u16 to save space since all values are within BMP range. + */ +struct ucs_recomposition { + u16 base; /* base character */ + u16 mark; /* combining mark */ + u16 recomposed; /* corresponding recomposed character */ +}; + +#include "ucs_recompose_table.h" + +struct compare_key { + u16 base; + u16 mark; +}; + +static int recomposition_cmp(const void *key, const void *element) +{ + const struct compare_key *search_key = key; + const struct ucs_recomposition *entry = element; + + /* Compare base character first */ + if (search_key->base < entry->base) + return -1; + if (search_key->base > entry->base) + return 1; + + /* Base characters match, now compare combining character */ + if (search_key->mark < entry->mark) + return -1; + if (search_key->mark > entry->mark) + return 1; + + /* Both match */ + return 0; +} + +/** + * ucs_recompose() - Attempt to recompose two Unicode characters into a single character. + * @base: Base Unicode code point (UCS-4) + * @mark: Combining mark Unicode code point (UCS-4) + * + * Return: Recomposed Unicode code point, or 0 if no recomposition is possible + */ +u32 ucs_recompose(u32 base, u32 mark) +{ + /* Check if characters are within the range of our table */ + if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE || + mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK) + return 0; + + struct compare_key key = { base, mark }; + struct ucs_recomposition *result = + __inline_bsearch(&key, ucs_recomposition_table, + ARRAY_SIZE(ucs_recomposition_table), + sizeof(*ucs_recomposition_table), + recomposition_cmp); + + return result ? result->recomposed : 0; +} + +/* + * The fallback table structures implement a 2-level lookup. + */ + +struct ucs_page_desc { + u8 page; /* Page index (high byte of code points) */ + u8 count; /* Number of entries in this page */ + u16 start; /* Start index in entries array */ +}; + +struct ucs_page_entry { + u8 offset; /* Offset within page (0-255) */ + u8 fallback; /* Fallback character or range start marker */ +}; + +#include "ucs_fallback_table.h" + +static int ucs_page_desc_cmp(const void *key, const void *element) +{ + u8 page = *(u8 *)key; + const struct ucs_page_desc *entry = element; + + if (page < entry->page) + return -1; + if (page > entry->page) + return 1; + return 0; +} + +static int ucs_page_entry_cmp(const void *key, const void *element) +{ + u8 offset = *(u8 *)key; + const struct ucs_page_entry *entry = element; + + if (offset < entry->offset) + return -1; + if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) { + if (offset > entry[1].offset) + return 1; + } else { + if (offset > entry->offset) + return 1; + } + return 0; +} + +/** + * ucs_get_fallback() - Get a substitution for the provided Unicode character + * @base: Base Unicode code point (UCS-4) + * + * Get a simpler fallback character for the provided Unicode character. + * This is used for terminal display when corresponding glyph is unavailable. + * The substitution may not be as good as the actual glyph for the original + * character but still way more helpful than a squared question mark. + * + * Return: Fallback Unicode code point, or 0 if none is available + */ +u32 ucs_get_fallback(u32 cp) +{ + const struct ucs_page_desc *page; + const struct ucs_page_entry *entry; + u8 page_idx = cp >> 8, offset = cp; + + if (!UCS_IS_BMP(cp)) + return 0; + + /* + * Full-width to ASCII mapping (covering all printable ASCII 33-126) + * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~) + * We process them programmatically to reduce the table size. + */ + if (cp >= 0xFF01 && cp <= 0xFF5E) + return cp - 0xFF01 + 33; + + page = __inline_bsearch(&page_idx, ucs_fallback_pages, + ARRAY_SIZE(ucs_fallback_pages), + sizeof(*ucs_fallback_pages), + ucs_page_desc_cmp); + if (!page) + return 0; + + entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start, + page->count, sizeof(*ucs_fallback_entries), + ucs_page_entry_cmp); + if (!entry) + return 0; + + if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) + entry++; + return entry->fallback; +} |