1631 lines
57 KiB
Plaintext
1631 lines
57 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "4502a404",
|
||
"metadata": {},
|
||
"source": [
|
||
"# x-icu\n",
|
||
"> **The lightweight bridge to ICU data.**"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "2baaf74f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Add libraries...\n",
|
||
"import os\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import urllib.request\n",
|
||
"import math"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "41361ca4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Configure what to get\n",
|
||
"CONFIG = {\n",
|
||
" 'sources':{\n",
|
||
" 'unicode':'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',\n",
|
||
" },\n",
|
||
" 'unicode':{\n",
|
||
" # Unicode 1.0 name, considered \"old\"\n",
|
||
" # Set to None to use the old name if the\n",
|
||
" # current name is not descriptive\n",
|
||
" # e.g. use \"START OF TEXT\" instead of \"<control>\"\n",
|
||
" 'useOldName':True,\n",
|
||
" 'getName':True, # Formal name of the codepoint\n",
|
||
"\n",
|
||
" # PROBABLY REMOVE # 'getCategory':False,\n",
|
||
" # TODO # 'getScript': True, # e.g., \"Cyrillic\"\n",
|
||
" # TODO # 'getBlock': True, # e.g., \"Mathematical Alphanumeric Symbols\"\n",
|
||
"\n",
|
||
" 'getDecomposition':False, # aka \"ascii-fy\"\n",
|
||
" 'getDecompositionType':False, # metadata about decomposition\n",
|
||
"\n",
|
||
" 'toLowercase':True,\n",
|
||
" 'toUppercase':True,\n",
|
||
" 'toTitlecase':False,\n",
|
||
"\n",
|
||
" # TODO: 'isEmoji': True, # Does it have an emoji presentation?\n",
|
||
" 'isPunctuation': True, # General Category starts with 'P'\n",
|
||
" 'isSymbol': False, # General Category starts with 'S' (Math, Currency)\n",
|
||
" 'isCombining': False, # Is it a mark/accent that needs a base letter?\n",
|
||
"\n",
|
||
" 'isPrintable':False,\n",
|
||
" 'isSpace':True,\n",
|
||
" 'isWhitespace':True,\n",
|
||
" 'isLetter':True,\n",
|
||
" 'isUppercase':False,\n",
|
||
" 'isLowercase':False,\n",
|
||
" 'isTitlecase':False,\n",
|
||
" 'isDeprecated':False,\n",
|
||
"\n",
|
||
" # NOTE: The following fields work in a cascade,\n",
|
||
" # so if one field is true, the next one is also true.\n",
|
||
" # This is defined by unicode, we don't do any extra processing.\n",
|
||
" 'isDecimal':True, # An actual number 0-9, in different languages\n",
|
||
" 'isDigit':False, # Digits not used in standard positional notation,\n",
|
||
" # like superscripts or circled numbers.\n",
|
||
" # Usually used by search engines.\n",
|
||
" 'isNumberLike':False, # Neither of the previous two, but number like in nature, like 3/4\n",
|
||
"\n",
|
||
" # These getters will get a numeric value out of the char\n",
|
||
" 'getDecimal': True,\n",
|
||
" 'getDigit': False,\n",
|
||
" 'getNumberLike': False,\n",
|
||
" }\n",
|
||
"}\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "94696c64",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Decodes the UnicodeData.txt data\n",
|
||
"class UnicodeDataParser:\n",
|
||
"\n",
|
||
" def __init__(self):\n",
|
||
" self.buffer = []\n",
|
||
" self.range_start_row = None\n",
|
||
" self.keys = [\n",
|
||
" \"codepoint\", \"name\", \"category\", \"combining_class\", \"bidi_class\",\n",
|
||
" \"decomposition\", \"decimal_val\", \"digit_val\", \"numeric_val\",\n",
|
||
" \"bidi_mirrored\", \"unicode_1_name\", \"iso_comment\", \n",
|
||
" \"uppercase_map\", \"lowercase_map\", \"titlecase_map\"\n",
|
||
" ]\n",
|
||
"\n",
|
||
" def parse_line(self, text_line: str):\n",
|
||
" if not text_line.strip():\n",
|
||
" return\n",
|
||
" \n",
|
||
" parts = text_line.strip().split(';')\n",
|
||
" # Ensure we have 15 columns as per spec\n",
|
||
" if len(parts) < 15:\n",
|
||
" return\n",
|
||
" \n",
|
||
" row = dict(zip(self.keys, parts))\n",
|
||
" row['codepoint'] = int(row['codepoint'], 16)\n",
|
||
" name = row['name']\n",
|
||
"\n",
|
||
" # Detect Range Start: e.g., \"<CJK Ideograph, First>\"\n",
|
||
" if name.endswith(', First>'):\n",
|
||
" self.range_start_row = row\n",
|
||
" \n",
|
||
" # Detect Range End: e.g., \"<CJK Ideograph, Last>\"\n",
|
||
" elif name.endswith(', Last>') and self.range_start_row:\n",
|
||
" self._fill_range(row)\n",
|
||
" self.range_start_row = None\n",
|
||
" \n",
|
||
" # Standard single codepoint\n",
|
||
" else:\n",
|
||
" self.buffer.append( self.normalize(row) )\n",
|
||
"\n",
|
||
" def _fill_range(self, end_row):\n",
|
||
" # Expands a First/Last pair into one row per codepoint, from First to Last inclusive.\n",
|
||
" start_hex = self.range_start_row['codepoint']\n",
|
||
" end_hex = end_row['codepoint']\n",
|
||
" \n",
|
||
" # Generic name for the range (stripping the \", First>\" part)\n",
|
||
" base_name = self.range_start_row['name'].replace(', First>', '').replace('<', '')\n",
|
||
"\n",
|
||
" # Loop this range then\n",
|
||
" for cp in range(start_hex, end_hex + 1):\n",
|
||
" new_row = self.range_start_row.copy()\n",
|
||
" new_row['codepoint'] = cp\n",
|
||
" new_row['name'] = f\"<{base_name}>\"\n",
|
||
" self.buffer.append( self.normalize(new_row) )\n",
|
||
"\n",
|
||
" def normalize(self, in_row:dict):\n",
|
||
" # Now, parse the fields individually\n",
|
||
" out_row = {}\n",
|
||
"\n",
|
||
" # Parse codepoint\n",
|
||
" out_row['codepoint'] = in_row['codepoint']\n",
|
||
" char = chr(out_row['codepoint'])\n",
|
||
"\n",
|
||
" out_row['name'] = in_row['name']\n",
|
||
" out_row['old_name'] = in_row['unicode_1_name']\n",
|
||
"\n",
|
||
" out_row['decomposition'] = in_row['decomposition']\n",
|
||
"\n",
|
||
" # Extracts text inside <> like <compat> or <circle>\n",
|
||
" decomp = in_row['decomposition']\n",
|
||
" out_row['decomposition_type'] = decomp[1:decomp.find('>')] if '<' in decomp else None\n",
|
||
"\n",
|
||
" category = in_row['category']\n",
|
||
" out_row['punctuation'] = category.startswith('P') \n",
|
||
" out_row['symbol'] = category.startswith('S')\n",
|
||
" out_row['combining'] = category.startswith('M') # Mark category\n",
|
||
" out_row['letter'] = category.startswith('L')\n",
|
||
"\n",
|
||
" out_row['uppercase'] = category == 'Lu'\n",
|
||
" out_row['lowercase'] = category == 'Ll'\n",
|
||
" out_row['titlecase'] = category == 'Lt'\n",
|
||
"\n",
|
||
" # Mappings\n",
|
||
" for k in [\"uppercase_map\", \"lowercase_map\", \"titlecase_map\"]:\n",
|
||
" out_row[k] = int(in_row[k], 16) if in_row[k] != '' else 0\n",
|
||
"\n",
|
||
" # Zs = Space Separator, but also check common control whitespaces\n",
|
||
" out_row['whitespace'] = category == 'Zs' or char in '\\t\\n\\r\\f\\v'\n",
|
||
"\n",
|
||
" # Only the 'C*' general categories are treated as non-printable; Separator (Z*) categories count as printable here\n",
|
||
" out_row['printable'] = not category.startswith('C')\n",
|
||
"\n",
|
||
" # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)\n",
|
||
" is_decimal = bool(in_row['decimal_val'])\n",
|
||
" is_digit = is_decimal or bool(in_row['digit_val'])\n",
|
||
" is_numeric = is_digit or bool(in_row['numeric_val'])\n",
|
||
"\n",
|
||
" out_row['decimal'] = is_decimal\n",
|
||
" out_row['digit'] = is_digit\n",
|
||
" out_row['number_like'] = is_numeric\n",
|
||
"\n",
|
||
" out_row['decimal_val'] = int(in_row['decimal_val']) if is_decimal else 0\n",
|
||
" out_row['digit_val'] = int(in_row['digit_val']) if is_digit else 0\n",
|
||
" out_row['number_like_val'] = in_row['numeric_val'] if is_numeric else None\n",
|
||
"\n",
|
||
" # return the data\n",
|
||
" return out_row\n",
|
||
"\n",
|
||
" def get_dataframe(self):\n",
|
||
" df = pd.DataFrame(self.buffer)\n",
|
||
" self.buffer.clear()\n",
|
||
" return df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "cfb55a58",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Download the data\n",
|
||
"DATA_DIR = './data'\n",
|
||
"os.makedirs(DATA_DIR, exist_ok=True)\n",
|
||
"\n",
|
||
"def downloadFile(key:str, url:str):\n",
|
||
" urllib.request.urlretrieve(url, f'{DATA_DIR}/{key}.bin')\n",
|
||
"\n",
|
||
"for k in CONFIG['sources']:\n",
|
||
" downloadFile(k, CONFIG['sources'][k])\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "7dac9d1f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>old_name</th>\n",
|
||
" <th>decomposition</th>\n",
|
||
" <th>decomposition_type</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>symbol</th>\n",
|
||
" <th>combining</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>uppercase</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" <th>titlecase_map</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>printable</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" <th>digit</th>\n",
|
||
" <th>number_like</th>\n",
|
||
" <th>decimal_val</th>\n",
|
||
" <th>digit_val</th>\n",
|
||
" <th>number_like_val</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>NULL</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>START OF HEADING</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>START OF TEXT</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>END OF TEXT</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td><control></td>\n",
|
||
" <td>END OF TRANSMISSION</td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299377</th>\n",
|
||
" <td>1114105</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299378</th>\n",
|
||
" <td>1114106</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299379</th>\n",
|
||
" <td>1114107</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299380</th>\n",
|
||
" <td>1114108</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299381</th>\n",
|
||
" <td>1114109</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>299382 rows × 23 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name old_name decomposition \\\n",
|
||
"0 0 <control> NULL \n",
|
||
"1 1 <control> START OF HEADING \n",
|
||
"2 2 <control> START OF TEXT \n",
|
||
"3 3 <control> END OF TEXT \n",
|
||
"4 4 <control> END OF TRANSMISSION \n",
|
||
"... ... ... ... ... \n",
|
||
"299377 1114105 <Plane 16 Private Use> \n",
|
||
"299378 1114106 <Plane 16 Private Use> \n",
|
||
"299379 1114107 <Plane 16 Private Use> \n",
|
||
"299380 1114108 <Plane 16 Private Use> \n",
|
||
"299381 1114109 <Plane 16 Private Use> \n",
|
||
"\n",
|
||
" decomposition_type punctuation symbol combining letter uppercase \\\n",
|
||
"0 NaN False False False False False \n",
|
||
"1 NaN False False False False False \n",
|
||
"2 NaN False False False False False \n",
|
||
"3 NaN False False False False False \n",
|
||
"4 NaN False False False False False \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"299377 NaN False False False False False \n",
|
||
"299378 NaN False False False False False \n",
|
||
"299379 NaN False False False False False \n",
|
||
"299380 NaN False False False False False \n",
|
||
"299381 NaN False False False False False \n",
|
||
"\n",
|
||
" ... lowercase_map titlecase_map whitespace printable decimal \\\n",
|
||
"0 ... 0 0 False False False \n",
|
||
"1 ... 0 0 False False False \n",
|
||
"2 ... 0 0 False False False \n",
|
||
"3 ... 0 0 False False False \n",
|
||
"4 ... 0 0 False False False \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"299377 ... 0 0 False False False \n",
|
||
"299378 ... 0 0 False False False \n",
|
||
"299379 ... 0 0 False False False \n",
|
||
"299380 ... 0 0 False False False \n",
|
||
"299381 ... 0 0 False False False \n",
|
||
"\n",
|
||
" digit number_like decimal_val digit_val number_like_val \n",
|
||
"0 False False 0 0 NaN \n",
|
||
"1 False False 0 0 NaN \n",
|
||
"2 False False 0 0 NaN \n",
|
||
"3 False False 0 0 NaN \n",
|
||
"4 False False 0 0 NaN \n",
|
||
"... ... ... ... ... ... \n",
|
||
"299377 False False 0 0 NaN \n",
|
||
"299378 False False 0 0 NaN \n",
|
||
"299379 False False 0 0 NaN \n",
|
||
"299380 False False 0 0 NaN \n",
|
||
"299381 False False 0 0 NaN \n",
|
||
"\n",
|
||
"[299382 rows x 23 columns]"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Now, for each construct of the configuration, parse the file\n",
|
||
"# vvv UNICODE vvv #\n",
|
||
"unicode = UnicodeDataParser()\n",
|
||
"with open(f'{DATA_DIR}/unicode.bin', 'r') as file:\n",
|
||
" for line in file:\n",
|
||
" unicode.parse_line(line)\n",
|
||
"\n",
|
||
"df = unicode.get_dataframe()\n",
|
||
"df\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "2029f383",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>old_name</th>\n",
|
||
" <th>decomposition</th>\n",
|
||
" <th>decomposition_type</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>symbol</th>\n",
|
||
" <th>combining</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>uppercase</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" <th>titlecase_map</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>printable</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" <th>digit</th>\n",
|
||
" <th>number_like</th>\n",
|
||
" <th>decimal_val</th>\n",
|
||
" <th>digit_val</th>\n",
|
||
" <th>number_like_val</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>48</th>\n",
|
||
" <td>48</td>\n",
|
||
" <td>DIGIT ZERO</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>49</th>\n",
|
||
" <td>49</td>\n",
|
||
" <td>DIGIT ONE</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50</th>\n",
|
||
" <td>50</td>\n",
|
||
" <td>DIGIT TWO</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>51</th>\n",
|
||
" <td>51</td>\n",
|
||
" <td>DIGIT THREE</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>52</th>\n",
|
||
" <td>52</td>\n",
|
||
" <td>DIGIT FOUR</td>\n",
|
||
" <td></td>\n",
|
||
" <td></td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93029</th>\n",
|
||
" <td>130037</td>\n",
|
||
" <td>SEGMENTED DIGIT FIVE</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0035</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>5</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93030</th>\n",
|
||
" <td>130038</td>\n",
|
||
" <td>SEGMENTED DIGIT SIX</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0036</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93031</th>\n",
|
||
" <td>130039</td>\n",
|
||
" <td>SEGMENTED DIGIT SEVEN</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0037</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93032</th>\n",
|
||
" <td>130040</td>\n",
|
||
" <td>SEGMENTED DIGIT EIGHT</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0038</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>8</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>93033</th>\n",
|
||
" <td>130041</td>\n",
|
||
" <td>SEGMENTED DIGIT NINE</td>\n",
|
||
" <td></td>\n",
|
||
" <td><font> 0039</td>\n",
|
||
" <td>font</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>9</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>770 rows × 23 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name old_name decomposition \\\n",
|
||
"48 48 DIGIT ZERO \n",
|
||
"49 49 DIGIT ONE \n",
|
||
"50 50 DIGIT TWO \n",
|
||
"51 51 DIGIT THREE \n",
|
||
"52 52 DIGIT FOUR \n",
|
||
"... ... ... ... ... \n",
|
||
"93029 130037 SEGMENTED DIGIT FIVE <font> 0035 \n",
|
||
"93030 130038 SEGMENTED DIGIT SIX <font> 0036 \n",
|
||
"93031 130039 SEGMENTED DIGIT SEVEN <font> 0037 \n",
|
||
"93032 130040 SEGMENTED DIGIT EIGHT <font> 0038 \n",
|
||
"93033 130041 SEGMENTED DIGIT NINE <font> 0039 \n",
|
||
"\n",
|
||
" decomposition_type punctuation symbol combining letter uppercase \\\n",
|
||
"48 NaN False False False False False \n",
|
||
"49 NaN False False False False False \n",
|
||
"50 NaN False False False False False \n",
|
||
"51 NaN False False False False False \n",
|
||
"52 NaN False False False False False \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"93029 font False False False False False \n",
|
||
"93030 font False False False False False \n",
|
||
"93031 font False False False False False \n",
|
||
"93032 font False False False False False \n",
|
||
"93033 font False False False False False \n",
|
||
"\n",
|
||
" ... lowercase_map titlecase_map whitespace printable decimal \\\n",
|
||
"48 ... 0 0 False True True \n",
|
||
"49 ... 0 0 False True True \n",
|
||
"50 ... 0 0 False True True \n",
|
||
"51 ... 0 0 False True True \n",
|
||
"52 ... 0 0 False True True \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"93029 ... 0 0 False True True \n",
|
||
"93030 ... 0 0 False True True \n",
|
||
"93031 ... 0 0 False True True \n",
|
||
"93032 ... 0 0 False True True \n",
|
||
"93033 ... 0 0 False True True \n",
|
||
"\n",
|
||
" digit number_like decimal_val digit_val number_like_val \n",
|
||
"48 True True 0 0 0 \n",
|
||
"49 True True 1 1 1 \n",
|
||
"50 True True 2 2 2 \n",
|
||
"51 True True 3 3 3 \n",
|
||
"52 True True 4 4 4 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"93029 True True 5 5 5 \n",
|
||
"93030 True True 6 6 6 \n",
|
||
"93031 True True 7 7 7 \n",
|
||
"93032 True True 8 8 8 \n",
|
||
"93033 True True 9 9 9 \n",
|
||
"\n",
|
||
"[770 rows x 23 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# TEST #\n",
|
||
"df[df['decimal'] == True]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "d29860e8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>cp_start</th>\n",
|
||
" <th>cp_end</th>\n",
|
||
" <th>punctuation</th>\n",
|
||
" <th>letter</th>\n",
|
||
" <th>whitespace</th>\n",
|
||
" <th>decimal</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>9</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>32</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>33</td>\n",
|
||
" <td>35</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>37</td>\n",
|
||
" <td>42</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>44</td>\n",
|
||
" <td>47</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>958</th>\n",
|
||
" <td>183984</td>\n",
|
||
" <td>191456</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>959</th>\n",
|
||
" <td>191472</td>\n",
|
||
" <td>192093</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>960</th>\n",
|
||
" <td>194560</td>\n",
|
||
" <td>195101</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>961</th>\n",
|
||
" <td>196608</td>\n",
|
||
" <td>201546</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>962</th>\n",
|
||
" <td>201552</td>\n",
|
||
" <td>210041</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>963 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" cp_start cp_end punctuation letter whitespace decimal\n",
|
||
"0 9 13 False False True False\n",
|
||
"1 32 32 False False True False\n",
|
||
"2 33 35 True False False False\n",
|
||
"3 37 42 True False False False\n",
|
||
"4 44 47 True False False False\n",
|
||
".. ... ... ... ... ... ...\n",
|
||
"958 183984 191456 False True False False\n",
|
||
"959 191472 192093 False True False False\n",
|
||
"960 194560 195101 False True False False\n",
|
||
"961 196608 201546 False True False False\n",
|
||
"962 201552 210041 False True False False\n",
|
||
"\n",
|
||
"[963 rows x 6 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>cp_start</th>\n",
|
||
" <th>cp_end</th>\n",
|
||
" <th>uppercase_map</th>\n",
|
||
" <th>lowercase_map</th>\n",
|
||
" <th>decimal_val</th>\n",
|
||
" <th>repeat</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>49</td>\n",
|
||
" <td>49</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>8</td>\n",
|
||
" <td>33</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>-9</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>32</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>-32</td>\n",
|
||
" <td>-32</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>84</td>\n",
|
||
" <td>59</td>\n",
|
||
" <td>775</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1509</th>\n",
|
||
" <td>34</td>\n",
|
||
" <td>34</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>-34</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1510</th>\n",
|
||
" <td>47</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>34</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1511</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1512</th>\n",
|
||
" <td>4760</td>\n",
|
||
" <td>4760</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>-8</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1513</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1514 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" cp_start cp_end uppercase_map lowercase_map decimal_val repeat\n",
|
||
"0 49 49 0 0 1 0\n",
|
||
"1 1 1 0 0 1 7\n",
|
||
"2 8 33 0 32 -9 0\n",
|
||
"3 32 32 -32 -32 0 0\n",
|
||
"4 84 59 775 0 0 0\n",
|
||
"... ... ... ... ... ... ...\n",
|
||
"1509 34 34 -34 -34 0 0\n",
|
||
"1510 47 14 34 0 1 0\n",
|
||
"1511 1 1 0 0 1 7\n",
|
||
"1512 4760 4760 0 0 -8 0\n",
|
||
"1513 1 1 0 0 1 7\n",
|
||
"\n",
|
||
"[1514 rows x 6 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>codepoint</th>\n",
|
||
" <th>name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NULL</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>START OF HEADING</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>START OF TEXT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>END OF TEXT</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>END OF TRANSMISSION</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299377</th>\n",
|
||
" <td>1114105</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299378</th>\n",
|
||
" <td>1114106</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299379</th>\n",
|
||
" <td>1114107</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299380</th>\n",
|
||
" <td>1114108</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>299381</th>\n",
|
||
" <td>1114109</td>\n",
|
||
" <td><Plane 16 Private Use></td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>299382 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" codepoint name\n",
|
||
"0 0 NULL\n",
|
||
"1 1 START OF HEADING\n",
|
||
"2 2 START OF TEXT\n",
|
||
"3 3 END OF TEXT\n",
|
||
"4 4 END OF TRANSMISSION\n",
|
||
"... ... ...\n",
|
||
"299377 1114105 <Plane 16 Private Use>\n",
|
||
"299378 1114106 <Plane 16 Private Use>\n",
|
||
"299379 1114107 <Plane 16 Private Use>\n",
|
||
"299380 1114108 <Plane 16 Private Use>\n",
|
||
"299381 1114109 <Plane 16 Private Use>\n",
|
||
"\n",
|
||
"[299382 rows x 2 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# UNICODE >> Apply config\n",
|
||
"u_cfg = CONFIG['unicode']\n",
|
||
"\n",
|
||
"# If useOldName is True, replace <control> or empty names with old_name\n",
|
||
"if u_cfg.get('useOldName'):\n",
|
||
" mask = (df['name'].str.startswith('<')) & (df['old_name'].str.len() > 0)\n",
|
||
" df.loc[mask, 'name'] = df.loc[mask, 'old_name']\n",
|
||
"\n",
|
||
"# Map CONFIG keys to actual DataFrame column names\n",
|
||
"mapping = {\n",
|
||
" 'getName': 'name',\n",
|
||
" 'getDecomposition': 'decomposition',\n",
|
||
" 'getDecompositionType': 'decomposition_type',\n",
|
||
" 'isPunctuation': 'punctuation',\n",
|
||
" 'isSymbol': 'symbol',\n",
|
||
" 'isCombining': 'combining',\n",
|
||
" 'isLetter': 'letter',\n",
|
||
" 'isUppercase': 'uppercase',\n",
|
||
" 'isLowercase': 'lowercase',\n",
|
||
" 'isTitlecase': 'titlecase',\n",
|
||
" 'toUppercase': 'uppercase_map',\n",
|
||
" 'toLowercase': 'lowercase_map',\n",
|
||
" 'toTitlecase': 'titlecase_map',\n",
|
||
" 'isWhitespace': 'whitespace',\n",
|
||
" 'isPrintable': 'printable',\n",
|
||
" 'isDecimal': 'decimal',\n",
|
||
" 'isDigit': 'digit',\n",
|
||
" 'isNumberLike': 'number_like',\n",
|
||
" 'getDecimal': 'decimal_val',\n",
|
||
" 'getDigit': 'digit_val',\n",
|
||
" 'getNumberLike': 'number_like_val'\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Categorize these keys\n",
|
||
"str_props = ['name','decomposition']\n",
|
||
"bool_props = [ y for x, y in mapping.items() if x.startswith('is') ]\n",
|
||
"int_props = [\n",
|
||
" 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map',\n",
|
||
" 'decimal_val', 'digit_val', 'number_like_val'\n",
|
||
"]\n",
|
||
"\n",
|
||
"keep_cols = ['codepoint'] # Always keep codepoint\n",
|
||
"keep_cols += [ col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key) ]\n",
|
||
"\n",
|
||
"# Filter the dataframe\n",
|
||
"df_keep = df[keep_cols]\n",
|
||
"\n",
|
||
"# Place all string properties in another df\n",
|
||
"df_str = df_keep[ ['codepoint'] + [x for x in str_props if x in keep_cols] ]\n",
|
||
"df_int = df_keep[ ['codepoint'] + [x for x in int_props if x in keep_cols] ]\n",
|
||
"df_bool = df_keep[ ['codepoint'] + [x for x in bool_props if x in keep_cols] ]\n",
|
||
"\n",
|
||
"# Convert boolean-like columns to actual bools or int8 to save space\n",
|
||
"for k in bool_props:\n",
|
||
" if k in df_bool.columns:\n",
|
||
" # Convert to bool...\n",
|
||
" df_bool[k] = df_bool[k].fillna(False).astype(bool)\n",
|
||
"\n",
|
||
"\n",
|
||
"# Remove all rows which are completely false (boolean rows)\n",
|
||
"feature_cols = [col for col in df_bool.columns if col != 'codepoint']\n",
|
||
"df_bool = df_bool[(~(df_bool[feature_cols].astype(bool) == False).all(axis=1))]\n",
|
||
"\n",
|
||
"feature_cols = [col for col in df_int.columns if col != 'codepoint']\n",
|
||
"df_int = df_int[(~(df_int[feature_cols].astype(int) == 0).all(axis=1))]\n",
|
||
"\n",
|
||
"feature_cols = [col for col in df_str.columns if col != 'codepoint']\n",
|
||
"df_str = df_str[(~(df_str[feature_cols] == \"\").all(axis=1))]\n",
|
||
"\n",
|
||
"# cleanup all 3\n",
|
||
"df_bool = df_bool.sort_values('codepoint').reset_index(drop=True)\n",
|
||
"df_int = df_int .sort_values('codepoint').reset_index(drop=True)\n",
|
||
"df_str = df_str .sort_values('codepoint').reset_index(drop=True)\n",
|
||
"\n",
|
||
"# [DELTA ENCODING]\n",
|
||
"def deltaEncode(df:pd.DataFrame, cols:list[str]):\n",
|
||
" changed = (df[cols] != df[cols].shift()).any(axis=1)\n",
|
||
" # Identify where the codepoint sequence breaks (gap > 1)\n",
|
||
" sequence_broken = (df['codepoint'].diff() > 1)\n",
|
||
" # A new range starts if metadata changed OR the sequence broke\n",
|
||
" range_id = (changed | sequence_broken).cumsum()\n",
|
||
" # 4. Group by the range_id and aggregate\n",
|
||
" df = df.groupby(range_id).agg(\n",
|
||
" cp_start=('codepoint', 'min'),\n",
|
||
" cp_end=('codepoint', 'max'),\n",
|
||
" **{col: (col, 'first') for col in cols}\n",
|
||
" )\n",
|
||
" df = df.reset_index(drop=True)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"# [RUN LENGTH ENCODING]\n",
|
||
"def runLengthEncoding(df):\n",
|
||
" deltas = df.diff().fillna(df).astype(int)\n",
|
||
" change_mask = (deltas != deltas.shift()).any(axis=1)\n",
|
||
" group_ids = change_mask.cumsum()\n",
|
||
" encoded_df = deltas.groupby(group_ids).first()\n",
|
||
" encoded_df['repeat'] = deltas.groupby(group_ids).size() - 1\n",
|
||
" return encoded_df.reset_index(drop=True)\n",
|
||
"\n",
|
||
"# [BOOLEAN OPTIMIZATION]\n",
|
||
"df_bool = deltaEncode(df_bool, [c for c in df_bool.columns if c != 'codepoint'])\n",
|
||
"\n",
|
||
"# [INT OPTIMIZATION]\n",
|
||
"for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n",
|
||
" if k not in df_int.columns: continue # vvv make mapping relative vvv\n",
|
||
" df_int[k] = df_int.apply(lambda x : x[k] - x['codepoint'] if x[k] != 0 else 0, axis=1)\n",
|
||
"\n",
|
||
"df_int = deltaEncode(df_int, [c for c in df_int.columns if c != 'codepoint'])\n",
|
||
"df_int = runLengthEncoding(df_int)\n",
|
||
"\n",
|
||
"display(df_bool)\n",
|
||
"display(df_int)\n",
|
||
"display(df_str)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "3959426e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Size of boolean dataframe: 8667'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Size of int dataframe: 36336'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# TEST BINARY SIZE #\n",
|
||
"codepoint_size = 4\n",
|
||
"flag_size = math.ceil(1./8. * (len(df_bool.columns) - 2))\n",
|
||
"\n",
|
||
"display(f\"Size of boolean dataframe: {(2 * codepoint_size + flag_size) * len(df_bool)}\")\n",
|
||
"display(f\"Size of int dataframe: {(len(df_int.columns) * codepoint_size) * len(df_int)}\")\n",
|
||
"# TODO: size of string dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a97b59a9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# EXPORT UNICODE #\n",
|
||
"OUT_DIR = './out'\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e75c5b2f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "1d0ba29e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.14.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|