1582 lines
55 KiB
Plaintext
1582 lines
55 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4502a404",
|
||
"metadata": {},
|
||
"source": [
|
||
"# x-icu\n",
|
||
"> **The lightweight bridge to ICU data.**"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "2baaf74f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Add libraries...\n",
|
||
"import os\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import urllib.request\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "41361ca4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Configure what to get\n",
|
||
"CONFIG = {\n",
|
||
" 'sources':{\n",
|
||
" 'unicode':'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',\n",
|
||
" },\n",
|
||
" 'unicode':{\n",
|
||
" # Unicode 1.0 name, considered \"old\"\n",
|
||
" # Set to None to use the old name if the\n",
|
||
" # current name is not descriptive\n",
|
||
" # e.g. use \"START OF TEXT\" instead of \"<control>\"\n",
|
||
" 'useOldName':True,\n",
|
||
" 'getName':True, # Formal name of the codepoint\n",
|
||
"\n",
|
||
" # PROBABLY REMOVE # 'getCategory':False,\n",
|
||
" # TODO # 'getScript': True, # e.g., \"Cyrillic\"\n",
|
||
" # TODO # 'getBlock': True, # e.g., \"Mathematical Alphanumeric Symbols\"\n",
|
||
"\n",
|
||
" 'getDecomposition':False, # aka \"ascii-fy\"\n",
|
||
" 'getDecompositionType':False, # metadata about decomposition\n",
|
||
"\n",
|
||
" 'toLowercase':True,\n",
|
||
" 'toUppercase':True,\n",
|
||
" 'toTitlecase':False,\n",
|
||
"\n",
|
||
" # TODO: 'isEmoji': True, # Does it have an emoji presentation?\n",
|
||
" 'isPunctuation': True, # General Category starts with 'P'\n",
|
||
" 'isSymbol': False, # General Category starts with 'S' (Math, Currency)\n",
|
||
" 'isCombining': False, # Is it a mark/accent that needs a base letter?\n",
|
||
"\n",
|
||
" 'isPrintable':False,\n",
|
||
" 'isSpace':True,\n",
|
||
" 'isWhitespace':True,\n",
|
||
" 'isLetter':True,\n",
|
||
" 'isUppercase':False,\n",
|
||
" 'isLowercase':False,\n",
|
||
" 'isTitlecase':False,\n",
|
||
" 'isDeprecated':False,\n",
|
||
"\n",
|
||
" # NOTE: The following fields work in a cascade,\n",
|
||
" # so if one field is true, the next one is also true.\n",
|
||
" # This is defined by unicode, we don't do any extra processing.\n",
|
||
" 'isDecimal':True, # An actual number 0-9, in different languages\n",
|
||
" 'isDigit':False, # Digits not used in standard positional notation,\n",
|
||
" # like superscripts or circled numbers.\n",
|
||
" # Usually used by search engines.\n",
|
||
" 'isNumberLike':False, # Neither of the previous two, but number like in nature, like 3/4\n",
|
||
" }\n",
|
||
"}\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "94696c64",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Decodes the UnicodeData.txt data\n",
|
||
"class UnicodeDataParser:\n",
|
||
"\n",
|
||
" def __init__(self):\n",
|
||
" self.buffer = []\n",
|
||
" self.range_start_row = None\n",
|
||
" self.keys = [\n",
|
||
" \"codepoint\", \"name\", \"category\", \"combining_class\", \"bidi_class\",\n",
|
||
" \"decomposition\", \"decimal_val\", \"digit_val\", \"numeric_val\",\n",
|
||
" \"bidi_mirrored\", \"unicode_1_name\", \"iso_comment\", \n",
|
||
" \"uppercase_map\", \"lowercase_map\", \"titlecase_map\"\n",
|
||
" ]\n",
|
||
"\n",
|
||
" def parse_line(self, text_line: str):\n",
|
||
" if not text_line.strip():\n",
|
||
" return\n",
|
||
" \n",
|
||
" parts = text_line.strip().split(';')\n",
|
||
" # Ensure we have 15 columns as per spec\n",
|
||
" if len(parts) < 15:\n",
|
||
" return\n",
|
||
" \n",
|
||
" row = dict(zip(self.keys, parts))\n",
|
||
" row['codepoint'] = int(row['codepoint'], 16)\n",
|
||
" name = row['name']\n",
|
||
"\n",
|
||
" # Detect Range Start: e.g., \"<CJK Ideograph, First>\"\n",
|
||
" if name.endswith(', First>'):\n",
|
||
" self.range_start_row = row\n",
|
||
" \n",
|
||
" # Detect Range End: e.g., \"<CJK Ideograph, Last>\"\n",
|
||
" elif name.endswith(', Last>') and self.range_start_row:\n",
|
||
" self._fill_range(row)\n",
|
||
" self.range_start_row = None\n",
|
||
" \n",
|
||
" # Standard single codepoint\n",
|
||
" else:\n",
|
||
" self.buffer.append( self.normalize(row) )\n",
|
||
"\n",
|
||
" def _fill_range(self, end_row):\n",
|
||
" # Expands the range into one row per codepoint, from First through Last inclusive.\n",
|
||
" start_hex = self.range_start_row['codepoint']\n",
|
||
" end_hex = end_row['codepoint']\n",
|
||
" \n",
|
||
" # Generic name for the range (stripping the \", First>\" part)\n",
|
||
" base_name = self.range_start_row['name'].replace(', First>', '').replace('<', '')\n",
|
||
"\n",
|
||
" # Loop this range then\n",
|
||
" for cp in range(start_hex, end_hex + 1):\n",
|
||
" new_row = self.range_start_row.copy()\n",
|
||
" new_row['codepoint'] = cp\n",
|
||
" new_row['name'] = f\"<{base_name}>\"\n",
|
||
" self.buffer.append( self.normalize(new_row) )\n",
|
||
"\n",
|
||
" def normalize(self, in_row:dict):\n",
|
||
" # Now, parse the fields individually\n",
|
||
" out_row = {}\n",
|
||
"\n",
|
||
" # Parse codepoint\n",
|
||
" out_row['codepoint'] = in_row['codepoint']\n",
|
||
" char = chr(out_row['codepoint'])\n",
|
||
"\n",
|
||
" out_row['name'] = in_row['name']\n",
|
||
" out_row['old_name'] = in_row['unicode_1_name']\n",
|
||
"\n",
|
||
" out_row['decomposition'] = in_row['decomposition']\n",
|
||
"\n",
|
||
" # Extracts text inside <> like <compat> or <circle>\n",
|
||
" decomp = in_row['decomposition']\n",
|
||
" out_row['decomposition_type'] = decomp[1:decomp.find('>')] if '<' in decomp else None\n",
|
||
"\n",
|
||
" category = in_row['category']\n",
|
||
" out_row['punctuation'] = category.startswith('P') \n",
|
||
" out_row['symbol'] = category.startswith('S')\n",
|
||
" out_row['combining'] = category.startswith('M') # Mark category\n",
|
||
" out_row['letter'] = category.startswith('L')\n",
|
||
"\n",
|
||
" out_row['uppercase'] = category == 'Lu'\n",
|
||
" out_row['lowercase'] = category == 'Ll'\n",
|
||
" out_row['titlecase'] = category == 'Lt'\n",
|
||
"\n",
|
||
" # Mappings\n",
|
||
" for k in [\"uppercase_map\", \"lowercase_map\", \"titlecase_map\"]:\n",
|
||
" out_row[k] = int(in_row[k], 16) if in_row[k] != '' else 0\n",
|
||
"\n",
|
||
" # Zs = Space Separator, but also check common control whitespaces\n",
|
||
" out_row['whitespace'] = category == 'Zs' or char in '\\t\\n\\r\\f\\v'\n",
|
||
"\n",
|
||
" # Every 'C*' general category (control, format, surrogate, private use) counts as non-printable; Separator (Z*) categories are NOT excluded here\n",
|
||
" out_row['printable'] = not category.startswith('C')\n",
|
||
"\n",
|
||
" # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)\n",
|
||
" is_decimal = bool(in_row['decimal_val'])\n",
|
||
" is_digit = is_decimal or bool(in_row['digit_val'])\n",
|
||
" is_numeric = is_digit or bool(in_row['numeric_val'])\n",
|
||
"\n",
|
||
" out_row['decimal'] = is_decimal\n",
|
||
" out_row['digit'] = is_digit\n",
|
||
" out_row['number_like'] = is_numeric\n",
|
||
"\n",
|
||
" # return the data\n",
|
||
" return out_row\n",
|
||
"\n",
|
||
" def get_dataframe(self):\n",
|
||
" df = pd.DataFrame(self.buffer)\n",
|
||
" self.buffer.clear()\n",
|
||
" return df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "cfb55a58",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Download the data\n",
|
||
"DATA_DIR = './data'\n",
|
||
"os.makedirs(DATA_DIR, exist_ok=True)\n",
|
||
"\n",
|
||
"def downloadFile(key:str, url:str):\n",
|
||
" urllib.request.urlretrieve(url, f'{DATA_DIR}/{key}.bin')\n",
|
||
"\n",
|
||
"for k in CONFIG['sources']:\n",
|
||
" downloadFile(k, CONFIG['sources'][k])\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "7dac9d1f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>old_name</th>\n",
|
||
" <th>decomposition</th>\n",
|
||
" <th>decomposition_type</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>symbol</th>\n",
|
||
" <th>combining</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>uppercase</th>\n",
|
||
" <th>lowercase</th>\n",
|
||
" <th>titlecase</th>\n",
|
||
" <th>uppercase_map</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" <th>titlecase_map</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>printable</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" <th>digit</th>\n",
|
||
" <th>number_like</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>NULL</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>START OF HEADING</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>START OF TEXT</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>END OF TEXT</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>END OF TRANSMISSION</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299377</th>\n",
|
||
" <td>1114105</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299378</th>\n",
|
||
" <td>1114106</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299379</th>\n",
|
||
" <td>1114107</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299380</th>\n",
|
||
" <td>1114108</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299381</th>\n",
|
||
" <td>1114109</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>299382 rows × 20 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name old_name decomposition \\\n",
|
||
"0 0 <control> NULL \n",
|
||
"1 1 <control> START OF HEADING \n",
|
||
"2 2 <control> START OF TEXT \n",
|
||
"3 3 <control> END OF TEXT \n",
|
||
"4 4 <control> END OF TRANSMISSION \n",
|
||
"... ... ... ... ... \n",
|
||
"299377 1114105 <Plane 16 Private Use> \n",
|
||
"299378 1114106 <Plane 16 Private Use> \n",
|
||
"299379 1114107 <Plane 16 Private Use> \n",
|
||
"299380 1114108 <Plane 16 Private Use> \n",
|
||
"299381 1114109 <Plane 16 Private Use> \n",
|
||
"\n",
|
||
" decomposition_type punctuation symbol combining letter uppercase \\\n",
|
||
"0 NaN False False False False False \n",
|
||
"1 NaN False False False False False \n",
|
||
"2 NaN False False False False False \n",
|
||
"3 NaN False False False False False \n",
|
||
"4 NaN False False False False False \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"299377 NaN False False False False False \n",
|
||
"299378 NaN False False False False False \n",
|
||
"299379 NaN False False False False False \n",
|
||
"299380 NaN False False False False False \n",
|
||
"299381 NaN False False False False False \n",
|
||
"\n",
|
||
" lowercase titlecase uppercase_map lowercase_map titlecase_map \\\n",
|
||
"0 False False 0 0 0 \n",
|
||
"1 False False 0 0 0 \n",
|
||
"2 False False 0 0 0 \n",
|
||
"3 False False 0 0 0 \n",
|
||
"4 False False 0 0 0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"299377 False False 0 0 0 \n",
|
||
"299378 False False 0 0 0 \n",
|
||
"299379 False False 0 0 0 \n",
|
||
"299380 False False 0 0 0 \n",
|
||
"299381 False False 0 0 0 \n",
|
||
"\n",
|
||
" whitespace printable decimal digit number_like \n",
|
||
"0 False False False False False \n",
|
||
"1 False False False False False \n",
|
||
"2 False False False False False \n",
|
||
"3 False False False False False \n",
|
||
"4 False False False False False \n",
|
||
"... ... ... ... ... ... \n",
|
||
"299377 False False False False False \n",
|
||
"299378 False False False False False \n",
|
||
"299379 False False False False False \n",
|
||
"299380 False False False False False \n",
|
||
"299381 False False False False False \n",
|
||
"\n",
|
||
"[299382 rows x 20 columns]"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Now, for each construct of the configuration, parse the file\n",
|
||
"# vvv UNICODE vvv #\n",
|
||
"unicode = UnicodeDataParser()\n",
|
||
"with open(f'{DATA_DIR}/unicode.bin', 'r') as file:\n",
|
||
" for line in file:\n",
|
||
" unicode.parse_line(line)\n",
|
||
"\n",
|
||
"df = unicode.get_dataframe()\n",
|
||
"df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "2029f383",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>old_name</th>\n",
|
||
" <th>decomposition</th>\n",
|
||
" <th>decomposition_type</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>symbol</th>\n",
|
||
" <th>combining</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>uppercase</th>\n",
|
||
" <th>lowercase</th>\n",
|
||
" <th>titlecase</th>\n",
|
||
" <th>uppercase_map</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" <th>titlecase_map</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>printable</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" <th>digit</th>\n",
|
||
" <th>number_like</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>160</td>\n",
|
||
" <td>NO-BREAK SPACE</td>\n",
|
||
" <td>NON-BREAKING SPACE</td>\n",
|
||
" <td><noBreak> 0020</td>\n",
|
||
" <td>noBreak</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>168</th>\n",
|
||
" <td>168</td>\n",
|
||
" <td>DIAERESIS</td>\n",
|
||
" <td>SPACING DIAERESIS</td>\n",
|
||
" <td><compat> 0020 0308</td>\n",
|
||
" <td>compat</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>170</th>\n",
|
||
" <td>170</td>\n",
|
||
" <td>FEMININE ORDINAL INDICATOR</td>\n",
|
||
" <td></td>\n",
|
||
" <td><super> 0061</td>\n",
|
||
" <td>super</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>175</th>\n",
|
||
" <td>175</td>\n",
|
||
" <td>MACRON</td>\n",
|
||
" <td>SPACING MACRON</td>\n",
|
||
" <td><compat> 0020 0304</td>\n",
|
||
" <td>compat</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>178</th>\n",
|
||
" <td>178</td>\n",
|
||
" <td>SUPERSCRIPT TWO</td>\n",
|
||
" <td>SUPERSCRIPT DIGIT TWO</td>\n",
|
||
" <td><super> 0032</td>\n",
|
||
" <td>super</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93029</th>\n",
|
||
" <td>130037</td>\n",
|
||
" <td>SEGMENTED DIGIT FIVE</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0035</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93030</th>\n",
|
||
" <td>130038</td>\n",
|
||
" <td>SEGMENTED DIGIT SIX</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0036</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93031</th>\n",
|
||
" <td>130039</td>\n",
|
||
" <td>SEGMENTED DIGIT SEVEN</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0037</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93032</th>\n",
|
||
" <td>130040</td>\n",
|
||
" <td>SEGMENTED DIGIT EIGHT</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0038</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93033</th>\n",
|
||
" <td>130041</td>\n",
|
||
" <td>SEGMENTED DIGIT NINE</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0039</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3833 rows × 20 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name old_name \\\n",
|
||
"160 160 NO-BREAK SPACE NON-BREAKING SPACE \n",
|
||
"168 168 DIAERESIS SPACING DIAERESIS \n",
|
||
"170 170 FEMININE ORDINAL INDICATOR \n",
|
||
"175 175 MACRON SPACING MACRON \n",
|
||
"178 178 SUPERSCRIPT TWO SUPERSCRIPT DIGIT TWO \n",
|
||
"... ... ... ... \n",
|
||
"93029 130037 SEGMENTED DIGIT FIVE \n",
|
||
"93030 130038 SEGMENTED DIGIT SIX \n",
|
||
"93031 130039 SEGMENTED DIGIT SEVEN \n",
|
||
"93032 130040 SEGMENTED DIGIT EIGHT \n",
|
||
"93033 130041 SEGMENTED DIGIT NINE \n",
|
||
"\n",
|
||
" decomposition decomposition_type punctuation symbol combining \\\n",
|
||
"160 <noBreak> 0020 noBreak False False False \n",
|
||
"168 <compat> 0020 0308 compat False True False \n",
|
||
"170 <super> 0061 super False False False \n",
|
||
"175 <compat> 0020 0304 compat False True False \n",
|
||
"178 <super> 0032 super False False False \n",
|
||
"... ... ... ... ... ... \n",
|
||
"93029 <font> 0035 font False False False \n",
|
||
"93030 <font> 0036 font False False False \n",
|
||
"93031 <font> 0037 font False False False \n",
|
||
"93032 <font> 0038 font False False False \n",
|
||
"93033 <font> 0039 font False False False \n",
|
||
"\n",
|
||
" letter uppercase lowercase titlecase uppercase_map lowercase_map \\\n",
|
||
"160 False False False False 0 0 \n",
|
||
"168 False False False False 0 0 \n",
|
||
"170 True False False False 0 0 \n",
|
||
"175 False False False False 0 0 \n",
|
||
"178 False False False False 0 0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"93029 False False False False 0 0 \n",
|
||
"93030 False False False False 0 0 \n",
|
||
"93031 False False False False 0 0 \n",
|
||
"93032 False False False False 0 0 \n",
|
||
"93033 False False False False 0 0 \n",
|
||
"\n",
|
||
" titlecase_map whitespace printable decimal digit number_like \n",
|
||
"160 0 True True False False False \n",
|
||
"168 0 False True False False False \n",
|
||
"170 0 False True False False False \n",
|
||
"175 0 False True False False False \n",
|
||
"178 0 False True False True True \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"93029 0 False True True True True \n",
|
||
"93030 0 False True True True True \n",
|
||
"93031 0 False True True True True \n",
|
||
"93032 0 False True True True True \n",
|
||
"93033 0 False True True True True \n",
|
||
"\n",
|
||
"[3833 rows x 20 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# TEST #\n",
|
||
"df[~df['decomposition_type'].isna()]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "d29860e8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>cp_start</th>\n",
|
||
" <th>cp_end</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>32</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>33</td>\n",
|
||
" <td>35</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>42</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>44</td>\n",
|
||
" <td>47</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>958</th>\n",
|
||
" <td>183984</td>\n",
|
||
" <td>191456</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>959</th>\n",
|
||
" <td>191472</td>\n",
|
||
" <td>192093</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>960</th>\n",
|
||
" <td>194560</td>\n",
|
||
" <td>195101</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>961</th>\n",
|
||
" <td>196608</td>\n",
|
||
" <td>201546</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>962</th>\n",
|
||
" <td>201552</td>\n",
|
||
" <td>210041</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>963 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" cp_start cp_end punctuation letter whitespace decimal\n",
|
||
"0 9 13 False False True False\n",
|
||
"1 32 32 False False True False\n",
|
||
"2 33 35 True False False False\n",
|
||
"3 37 42 True False False False\n",
|
||
"4 44 47 True False False False\n",
|
||
".. ... ... ... ... ... ...\n",
|
||
"958 183984 191456 False True False False\n",
|
||
"959 191472 192093 False True False False\n",
|
||
"960 194560 195101 False True False False\n",
|
||
"961 196608 201546 False True False False\n",
|
||
"962 201552 210041 False True False False\n",
|
||
"\n",
|
||
"[963 rows x 6 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>uppercase_map</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>65</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>66</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>67</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>68</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>69</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2984</th>\n",
|
||
" <td>125247</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2985</th>\n",
|
||
" <td>125248</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2986</th>\n",
|
||
" <td>125249</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2987</th>\n",
|
||
" <td>125250</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2988</th>\n",
|
||
" <td>125251</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2989 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint uppercase_map lowercase_map\n",
|
||
"0 65 0 32\n",
|
||
"1 66 0 32\n",
|
||
"2 67 0 32\n",
|
||
"3 68 0 32\n",
|
||
"4 69 0 32\n",
|
||
"... ... ... ...\n",
|
||
"2984 125247 -34 0\n",
|
||
"2985 125248 -34 0\n",
|
||
"2986 125249 -34 0\n",
|
||
"2987 125250 -34 0\n",
|
||
"2988 125251 -34 0\n",
|
||
"\n",
|
||
"[2989 rows x 3 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NULL</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>START OF HEADING</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>START OF TEXT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>END OF TEXT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>END OF TRANSMISSION</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299377</th>\n",
|
||
" <td>1114105</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299378</th>\n",
|
||
" <td>1114106</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299379</th>\n",
|
||
" <td>1114107</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299380</th>\n",
|
||
" <td>1114108</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299381</th>\n",
|
||
" <td>1114109</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>299382 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name\n",
|
||
"0 0 NULL\n",
|
||
"1 1 START OF HEADING\n",
|
||
"2 2 START OF TEXT\n",
|
||
"3 3 END OF TEXT\n",
|
||
"4 4 END OF TRANSMISSION\n",
|
||
"... ... ...\n",
|
||
"299377 1114105 <Plane 16 Private Use>\n",
|
||
"299378 1114106 <Plane 16 Private Use>\n",
|
||
"299379 1114107 <Plane 16 Private Use>\n",
|
||
"299380 1114108 <Plane 16 Private Use>\n",
|
||
"299381 1114109 <Plane 16 Private Use>\n",
|
||
"\n",
|
||
"[299382 rows x 2 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"# UNICODE >> Apply config\n",
"u_cfg = CONFIG['unicode']\n",
"\n",
"# If useOldName is set, replace placeholder names such as <control> with the\n",
"# Unicode 1.0 name whenever one exists. This updates df in place.\n",
"if u_cfg.get('useOldName'):\n",
"    # na=False / gt(0) treat missing values as 'no match' so the mask stays boolean\n",
"    is_placeholder = df['name'].str.startswith('<', na=False)\n",
"    has_old_name = df['old_name'].str.len().gt(0)\n",
"    mask = is_placeholder & has_old_name\n",
"    df.loc[mask, 'name'] = df.loc[mask, 'old_name']\n",
"\n",
"# Map CONFIG keys to actual DataFrame column names\n",
"mapping = {\n",
"    'getName': 'name',\n",
"    'getDecomposition': 'decomposition',\n",
"    'getDecompositionType': 'decomposition_type',\n",
"    'isPunctuation': 'punctuation',\n",
"    'isSymbol': 'symbol',\n",
"    'isCombining': 'combining',\n",
"    'isLetter': 'letter',\n",
"    'isUppercase': 'uppercase',\n",
"    'isLowercase': 'lowercase',\n",
"    'isTitlecase': 'titlecase',\n",
"    'toUppercase': 'uppercase_map',\n",
"    'toLowercase': 'lowercase_map',\n",
"    'toTitlecase': 'titlecase_map',\n",
"    'isWhitespace': 'whitespace',\n",
"    'isPrintable': 'printable',\n",
"    'isDecimal': 'decimal',\n",
"    'isDigit': 'digit',\n",
"    'isNumberLike': 'number_like'\n",
"}\n",
"\n",
"# Group the column names by value type\n",
"str_props = ['name', 'decomposition']\n",
"bool_props = [col_name for cfg_key, col_name in mapping.items() if cfg_key.startswith('is')]\n",
"int_props = ['decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map']\n",
"\n",
"keep_cols = ['codepoint']  # Always keep codepoint\n",
"keep_cols += [col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key)]\n",
"\n",
"# Filter the dataframe\n",
"df_keep = df[keep_cols]\n",
"\n",
"# Split the kept columns into one frame per value type. The .copy() calls make\n",
"# each frame independent of df, so the column assignments below do not trigger\n",
"# SettingWithCopyWarning and cannot write through to the source frame.\n",
"df_str = df_keep[['codepoint'] + [x for x in str_props if x in keep_cols]].copy()\n",
"df_int = df_keep[['codepoint'] + [x for x in int_props if x in keep_cols]].copy()\n",
"df_bool = df_keep[['codepoint'] + [x for x in bool_props if x in keep_cols]].copy()\n",
"\n",
"# Convert the flag columns to real booleans (missing values become False)\n",
"for k in bool_props:\n",
"    if k in df_bool.columns:\n",
"        df_bool[k] = df_bool[k].fillna(False).astype(bool)\n",
"\n",
"# Drop the rows that carry no information for their frame\n",
"feature_cols = [col for col in df_bool.columns if col != 'codepoint']\n",
"df_bool = df_bool[df_bool[feature_cols].any(axis=1)]  # at least one flag set\n",
"\n",
"feature_cols = [col for col in df_int.columns if col != 'codepoint']\n",
"df_int = df_int[(df_int[feature_cols].astype(int) != 0).any(axis=1)]  # at least one non-zero value\n",
"\n",
"feature_cols = [col for col in df_str.columns if col != 'codepoint']\n",
"df_str = df_str[(df_str[feature_cols] != \"\").any(axis=1)]  # at least one non-empty string\n",
"\n",
"# cleanup all 3\n",
"df_bool = df_bool.sort_values('codepoint').reset_index(drop=True)\n",
"df_int = df_int.sort_values('codepoint').reset_index(drop=True)\n",
"df_str = df_str.sort_values('codepoint').reset_index(drop=True)\n",
"\n",
"# [BOOLEAN OPTIMIZATION]\n",
"# Collapse runs of consecutive codepoints with identical flags into ranges.\n",
"# A row differs from its predecessor if any flag changed (codepoint excluded)...\n",
"metadata_cols = [c for c in df_bool.columns if c != 'codepoint']\n",
"metadata_changed = (df_bool[metadata_cols] != df_bool[metadata_cols].shift()).any(axis=1)\n",
"# ...or if the codepoint sequence breaks (gap > 1)\n",
"sequence_broken = (df_bool['codepoint'].diff() > 1)\n",
"# A new range starts if metadata changed OR the sequence broke\n",
"range_id = (metadata_changed | sequence_broken).cumsum()\n",
"# Group by the range_id and keep first/last codepoint plus the shared flags\n",
"df_bool = df_bool.groupby(range_id).agg(\n",
"    cp_start=('codepoint', 'min'),\n",
"    cp_end=('codepoint', 'max'),\n",
"    **{col: (col, 'first') for col in metadata_cols}\n",
")\n",
"df_bool = df_bool.reset_index(drop=True)\n",
"\n",
"# [INT OPTIMIZATION]\n",
"# Store case mappings as offsets from the codepoint. A value of 0 means\n",
"# 'no mapping' and stays 0. Vectorised, so it also works on an empty frame.\n",
"for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n",
"    if k not in df_int.columns:\n",
"        continue\n",
"    df_int[k] = (df_int[k] - df_int['codepoint']).where(df_int[k] != 0, 0)\n",
"\n",
"display(df_bool)\n",
"display(df_int)\n",
"display(df_str)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "73cf0f32",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"True"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"# TEST #\n",
"# Show, in order, whether the string, boolean and integer frames are empty\n",
"display(df_str.empty, df_bool.empty, df_int.empty)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2a0085c0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Format of binaries\n",
|
||
"# \"X-ICU\"\n",
|
||
"# <Same Byte Size:16 -- 0 if not same size>\n",
|
||
"# <Amount of ranges>\n",
|
||
"# <cp0><cp1><byte size of block:8 -- not present if same size>"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a97b59a9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# EXPORT UNICODE #\n",
"OUT_DIR = './out'\n",
"str_cols = ['name', 'decomposition']\n",
"cp_size = 4\n",
"\n",
"# Records have the same size when strings are padded, when strings go to a\n",
"# dedicated file, or when no string column is exported at all.\n",
"# NOTE(review): the last test used df.columns, which always contains 'name',\n",
"# so it could never be true; df_str holds only the enabled string columns.\n",
"# Confirm this is the intended frame.\n",
"same_size = (\n",
"    CONFIG['general']['zeroPadStrings']\n",
"    or CONFIG['general']['exportStringsInDedicatedFile']\n",
"    or set(str_cols).isdisjoint(df_str.columns)\n",
")\n",
"\n",
"# Select the boolean ('is*') properties that exist as DataFrame columns.\n",
"# Columns are named by the mapping values, so test col_name, not cfg_key.\n",
"# TODO: the per-property sizes are not computed yet.\n",
"bool_export_cols = []\n",
"for cfg_key, col_name in mapping.items():\n",
"    if col_name in df.columns and cfg_key.startswith('is'):\n",
"        bool_export_cols.append(col_name)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e75c5b2f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"True"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1d0ba29e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.14.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|