211 lines
6.8 KiB
Plaintext
211 lines
6.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4502a404",
|
|
"metadata": {},
|
|
"source": [
|
|
"# x-icu\n",
|
|
"> **The lightweight bridge to ICU data.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2baaf74f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Add libraries..."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "41361ca4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Configure what to get\n",
|
|
"config = {\n",
|
|
"    'general': {\n",
|
|
"        # When true, the strings are exported\n",
|
|
"        # to their own dedicated file.\n",
|
|
"        'exportStringsInDedicatedFile': True,\n",
|
|
"        # When true, each block of strings is\n",
|
|
"        # padded to a uniform size, which makes\n",
|
|
"        # jumping between blocks straightforward\n",
|
|
"        # and fast.\n",
|
|
"        'zeroPadStrings': True,\n",
|
|
"    },\n",
|
|
"    'sources': {\n",
|
|
"        'unicode': 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',\n",
|
|
"    },\n",
|
|
"    'unicode': {\n",
|
|
"        # Use of the Unicode 1.0 name, considered \"old\".\n",
|
|
"        # Set to None to use the old name if the\n",
|
|
"        # current name is not descriptive,\n",
|
|
"        # e.g. \"START OF TEXT\" instead of \"<control>\".\n",
|
|
"        'useOldName': False,\n",
|
|
"        # Formal name of the codepoint\n",
|
|
"        'getName': False,\n",
|
|
"        # PROBABLY REMOVE # 'getCategory':False,\n",
|
|
"        # TODO # 'getScript': True, # e.g., \"Cyrillic\"\n",
|
|
"        # TODO # 'getBlock': True, # e.g., \"Mathematical Alphanumeric Symbols\"\n",
|
|
"\n",
|
|
"        # aka \"ascii-fy\"\n",
|
|
"        'getDecomposition': False,\n",
|
|
"        # Metadata about the decomposition\n",
|
|
"        'getDecompositionType': False,\n",
|
|
"\n",
|
|
"        'toLowercase': False,\n",
|
|
"        'toUppercase': False,\n",
|
|
"        'toTitlecase': False,\n",
|
|
"\n",
|
|
"        # Does it have an emoji presentation?\n",
|
|
"        'isEmoji': True,\n",
|
|
"        # General Category starts with 'P'\n",
|
|
"        'isPunctuation': True,\n",
|
|
"        # General Category starts with 'S' (Math, Currency)\n",
|
|
"        'isSymbol': True,\n",
|
|
"        # Is it a mark/accent that needs a base letter?\n",
|
|
"        'isCombining': True,\n",
|
|
"\n",
|
|
"        'isPrintable': True,\n",
|
|
"        'isSpace': True,\n",
|
|
"        'isWhitespace': True,\n",
|
|
"        'isLetter': True,\n",
|
|
"        'isUppercase': True,\n",
|
|
"        'isLowercase': True,\n",
|
|
"        'isTitlecase': True,\n",
|
|
"        'isDeprecated': True,\n",
|
|
"\n",
|
|
"        # NOTE: The following fields work in a cascade,\n",
|
|
"        # so if one field is true, the next one is also true.\n",
|
|
"        # An actual number 0-9, in different languages\n",
|
|
"        'isDecimal': True,\n",
|
|
"        # Digits not used in standard positional notation,\n",
|
|
"        # like superscripts or circled numbers.\n",
|
|
"        # Usually used by search engines.\n",
|
|
"        'isDigit': True,\n",
|
|
"        # Neither of the previous two, but number-like in nature, like 3/4\n",
|
|
"        'isNumberLike': True,\n",
|
|
"    },\n",
|
|
"}\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "94696c64",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def unicodeDataParseLine(text_line:str, config:dict):\n",
|
|
"    \"\"\"Parse one UnicodeData.txt record into a dict of derived fields.\n",
|
|
"\n",
|
|
"    Example record:\n",
|
|
"        10D84;GARAY SMALL LETTER OLD KA;Ll;0;R;;;;;N;;;10D64;;10D64\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        text_line: One semicolon-separated record. Leading and trailing\n",
|
|
"            whitespace is stripped before the fields are split.\n",
|
|
"        config: Export configuration. Not consulted here: every field\n",
|
|
"            is always computed.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        A dict with 'codepoint' (int), 'name', 'old_name',\n",
|
|
"        'decomposition' (the raw field), 'decomposition_type' (the text\n",
|
|
"        inside a leading <...> tag, else None) and the boolean flags\n",
|
|
"        'punctuation', 'symbol', 'combining', 'letter', 'uppercase',\n",
|
|
"        'lowercase', 'titlecase', 'whitespace', 'printable', 'decimal',\n",
|
|
"        'digit' and 'number_like'.\n",
|
|
"\n",
|
|
"    Raises:\n",
|
|
"        ValueError: If the codepoint field is not valid hexadecimal.\n",
|
|
"        KeyError: If the record lacks one of the fields read below.\n",
|
|
"    \"\"\"\n",
|
|
"    in_keys = [\n",
|
|
"        \"codepoint\", \"name\", \"category\", \"combining_class\", \"bidi_class\",\n",
|
|
"        \"decomposition\", \"decimal_val\", \"digit_val\", \"numeric_val\",\n",
|
|
"        \"bidi_mirrored\", \"unicode_1_name\", \"iso_comment\",\n",
|
|
"        \"uppercase_map\", \"lowercase_map\", \"titlecase_map\"\n",
|
|
"    ]\n",
|
|
"    parts = text_line.strip().split(';')\n",
|
|
"    in_row = dict(zip(in_keys, parts))\n",
|
|
"\n",
|
|
"    # Now, parse the fields individually\n",
|
|
"    out_row = {}\n",
|
|
"\n",
|
|
"    # Parse codepoint (hexadecimal in the source record)\n",
|
|
"    out_row['codepoint'] = int(in_row['codepoint'], 16)\n",
|
|
"    char = chr(out_row['codepoint'])\n",
|
|
"\n",
|
|
"    out_row['name'] = in_row['name']\n",
|
|
"    out_row['old_name'] = in_row['unicode_1_name']\n",
|
|
"\n",
|
|
"    # The raw decomposition keeps its optional <tag> prefix\n",
|
|
"    decomp = in_row['decomposition']\n",
|
|
"    out_row['decomposition'] = decomp\n",
|
|
"\n",
|
|
"    # Extracts text inside <> like <compat> or <circle>\n",
|
|
"    out_row['decomposition_type'] = decomp[1:decomp.find('>')] if '<' in decomp else None\n",
|
|
"\n",
|
|
"    category = in_row['category']\n",
|
|
"    out_row['punctuation'] = category.startswith('P')\n",
|
|
"    out_row['symbol'] = category.startswith('S')\n",
|
|
"    out_row['combining'] = category.startswith('M')  # Mark category\n",
|
|
"    out_row['letter'] = category.startswith('L')\n",
|
|
"\n",
|
|
"    out_row['uppercase'] = category == 'Lu'\n",
|
|
"    out_row['lowercase'] = category == 'Ll'\n",
|
|
"    out_row['titlecase'] = category == 'Lt'\n",
|
|
"\n",
|
|
"    # Unicode White_Space property: every Separator category (Zs, Zl, Zp)\n",
|
|
"    # plus the controls TAB, LF, VT, FF, CR and NEL (U+0085). Checking\n",
|
|
"    # only Zs would miss U+2028 LINE SEPARATOR and U+2029 PARAGRAPH\n",
|
|
"    # SEPARATOR.\n",
|
|
"    out_row['whitespace'] = category.startswith('Z') or char in '\\t\\n\\r\\f\\v\\x85'\n",
|
|
"\n",
|
|
"    # Only the Other (C*) categories are treated as non-printable;\n",
|
|
"    # separators (Z*) are reported as printable.\n",
|
|
"    out_row['printable'] = not category.startswith('C')\n",
|
|
"\n",
|
|
"    # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)\n",
|
|
"    # Each flag implies the next one in the chain.\n",
|
|
"    is_decimal = bool(in_row['decimal_val'])\n",
|
|
"    is_digit = is_decimal or bool(in_row['digit_val'])\n",
|
|
"    is_numeric = is_digit or bool(in_row['numeric_val'])\n",
|
|
"\n",
|
|
"    out_row['decimal'] = is_decimal\n",
|
|
"    out_row['digit'] = is_digit\n",
|
|
"    out_row['number_like'] = is_numeric\n",
|
|
"\n",
|
|
"    # return the data\n",
|
|
"    return out_row\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2a0085c0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Format of binaries\n",
|
|
"# [HEAD]\n",
|
|
"# <Amount of ranges>\n",
|
|
"# <cp0><cp1><byte size of block>\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a97b59a9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e75c5b2f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1d0ba29e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|