{ "cells": [ { "cell_type": "markdown", "id": "4502a404", "metadata": {}, "source": [ "# x-icu\n", "> **The lightweight bridge to ICU data.**" ] }, { "cell_type": "code", "execution_count": 19, "id": "2baaf74f", "metadata": {}, "outputs": [], "source": [ "# Add libraries...\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import urllib.request\n", "import math" ] }, { "cell_type": "code", "execution_count": 2, "id": "41361ca4", "metadata": {}, "outputs": [], "source": [ "# Configure what to get\n", "CONFIG = {\n", " 'sources':{\n", " 'unicode':'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',\n", " },\n", " 'unicode':{\n", " # Unicode 1.0 name, considered \"old\"\n", " # Set to None to use the old name if the\n", " # current name is not descriptive\n", " # e.g. use \"START OF TEXT\" instead of \"\"\n", " 'useOldName':True,\n", " 'getName':True, # Formal name of the codepoint\n", "\n", " # PROBABLY REMOVE # 'getCategory':False,\n", " # TODO # 'getScript': True, # e.g., \"Cyrillic\"\n", " # TODO # 'getBlock': True, # e.g., \"Mathematical Alphanumeric Symbols\"\n", "\n", " 'getDecomposition':False, # aka \"ascii-fy\"\n", " 'getDecompositionType':False, # metadata about decomposition\n", "\n", " 'toLowercase':True,\n", " 'toUppercase':True,\n", " 'toTitlecase':False,\n", "\n", " # TODO: 'isEmoji': True, # Does it have an emoji presentation?\n", " 'isPunctuation': True, # General Category starts with 'P'\n", " 'isSymbol': False, # General Category starts with 'S' (Math, Currency)\n", " 'isCombining': False, # Is it a mark/accent that needs a base letter?\n", "\n", " 'isPrintable':False,\n", " 'isSpace':True,\n", " 'isWhitespace':True,\n", " 'isLetter':True,\n", " 'isUppercase':False,\n", " 'isLowercase':False,\n", " 'isTitlecase':False,\n", " 'isDeprecated':False,\n", "\n", " # NOTE: The following fields work in a cascade,\n", " # so if one field is true, the next one is also true.\n", " # This is defined by unicode, we don't do any extra processing.\n", " 'isDecimal':True, # An actual number 0-9, in different languages\n", " 'isDigit':False, # Digits not used in standard positional notation,\n", " # like superscripts or circled numbers.\n", " # Usually used by search engines.\n", " 'isNumberLike':False, # Neither of the previous two, but number like in nature, like 3/4\n", "\n", " # These getters will get a numeric value out of the char\n", " 'getDecimal': True,\n", " 'getDigit': False,\n", " 'getNumberLike': False,\n", " }\n", "}\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "94696c64", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Decoders the UnicodeData.txt data\n", "class UnicodeDataParser:\n", "\n", " def __init__(self):\n", " self.buffer = []\n", " self.range_start_row = None\n", " self.keys = [\n", " \"codepoint\", \"name\", \"category\", \"combining_class\", \"bidi_class\",\n", " \"decomposition\", \"decimal_val\", \"digit_val\", \"numeric_val\",\n", " \"bidi_mirrored\", \"unicode_1_name\", \"iso_comment\", \n", " \"uppercase_map\", \"lowercase_map\", \"titlecase_map\"\n", " ]\n", "\n", " def parse_line(self, text_line: str):\n", " if not text_line.strip():\n", " return\n", " \n", " parts = text_line.strip().split(';')\n", " # Ensure we have 15 columns as per spec\n", " if len(parts) < 15:\n", " return\n", " \n", " row = dict(zip(self.keys, parts))\n", " row['codepoint'] = int(row['codepoint'], 16)\n", " name = row['name']\n", "\n", " # Detect Range Start: e.g., \"\"\n", " if name.endswith(', First>'):\n", " self.range_start_row = row\n", " \n", " # Detect Range End: e.g., \"\"\n", " elif name.endswith(', Last>') and self.range_start_row:\n", " self._fill_range(row)\n", " self.range_start_row = None\n", " \n", " # Standard single codepoint\n", " else:\n", " self.buffer.append( self.normalize(row) )\n", "\n", " def _fill_range(self, end_row):\n", " # Linearly interpolates all codepoints between First and Last.\n", " start_hex = self.range_start_row['codepoint']\n", " end_hex = end_row['codepoint']\n", " \n", " # Generic name for the range (stripping the \", First>\" part)\n", " base_name = self.range_start_row['name'].replace(', First>', '').replace('<', '')\n", "\n", " # Loop this range then\n", " for cp in range(start_hex, end_hex + 1):\n", " new_row = self.range_start_row.copy()\n", " new_row['codepoint'] = cp\n", " new_row['name'] = f\"<{base_name}>\"\n", " self.buffer.append( self.normalize(new_row) )\n", "\n", " def normalize(self, in_row:dict):\n", " # Now, parse the fields individually\n", " out_row = {}\n", "\n", " # Parse codepoint\n", " out_row['codepoint'] = in_row['codepoint']\n", " char = chr(out_row['codepoint'])\n", "\n", " out_row['name'] = in_row['name']\n", " out_row['old_name'] = in_row['unicode_1_name']\n", "\n", " out_row['decomposition'] = in_row['decomposition']\n", "\n", " # Extracts text inside <> like or \n", " decomp = in_row['decomposition']\n", " out_row['decomposition_type'] = decomp[1:decomp.find('>')] if '<' in decomp else None\n", "\n", " category = in_row['category']\n", " out_row['punctuation'] = category.startswith('P') \n", " out_row['symbol'] = category.startswith('S')\n", " out_row['combining'] = category.startswith('M') # Mark category\n", " out_row['letter'] = category.startswith('L')\n", "\n", " out_row['uppercase'] = category == 'Lu'\n", " out_row['lowercase'] = category == 'Ll'\n", " out_row['titlecase'] = category == 'Lt'\n", "\n", " # Mappings\n", " for k in [\"uppercase_map\", \"lowercase_map\", \"titlecase_map\"]:\n", " out_row[k] = int(in_row[k], 16) if in_row[k] != '' else 0\n", "\n", " # Zs = Space Separator, but also check common control whitespaces\n", " out_row['whitespace'] = category == 'Zs' or char in '\\t\\n\\r\\f\\v'\n", "\n", " # Non-printable are usually Control (C) and some Separator (Z) categories\n", " out_row['printable'] = not category.startswith('C')\n", "\n", " # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)\n", " is_decimal = bool(in_row['decimal_val'])\n", " is_digit = is_decimal or bool(in_row['digit_val'])\n", " is_numeric = is_digit or bool(in_row['numeric_val'])\n", "\n", " out_row['decimal'] = is_decimal\n", " out_row['digit'] = is_digit\n", " out_row['number_like'] = is_numeric\n", "\n", " out_row['decimal_val'] = int(in_row['decimal_val']) if is_decimal else 0\n", " out_row['digit_val'] = int(in_row['digit_val']) if is_digit else 0\n", " out_row['number_like_val'] = in_row['numeric_val'] if is_numeric else None\n", "\n", " # return the data\n", " return out_row\n", "\n", " def get_dataframe(self):\n", " df = pd.DataFrame(self.buffer)\n", " self.buffer.clear()\n", " return df\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "cfb55a58", "metadata": {}, "outputs": [], "source": [ "# Download the data\n", "DATA_DIR = './data'\n", "os.makedirs(DATA_DIR, exist_ok=True)\n", "\n", "def downloadFile(key:str, url:str):\n", " urllib.request.urlretrieve(url, f'{DATA_DIR}/{key}.bin')\n", "\n", "for k in CONFIG['sources']:\n", " downloadFile(k, CONFIG['sources'][k])\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "7dac9d1f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codepointnameold_namedecompositiondecomposition_typepunctuationsymbolcombiningletteruppercase...lowercase_maptitlecase_mapwhitespaceprintabledecimaldigitnumber_likedecimal_valdigit_valnumber_like_val
00<control>NULLNaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
11<control>START OF HEADINGNaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
22<control>START OF TEXTNaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
33<control>END OF TEXTNaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
44<control>END OF TRANSMISSIONNaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
..................................................................
2993771114105<Plane 16 Private Use>NaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
2993781114106<Plane 16 Private Use>NaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
2993791114107<Plane 16 Private Use>NaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
2993801114108<Plane 16 Private Use>NaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
2993811114109<Plane 16 Private Use>NaNFalseFalseFalseFalseFalse...00FalseFalseFalseFalseFalse00NaN
\n", "

299382 rows × 23 columns

\n", "
" ], "text/plain": [ " codepoint name old_name decomposition \\\n", "0 0 NULL \n", "1 1 START OF HEADING \n", "2 2 START OF TEXT \n", "3 3 END OF TEXT \n", "4 4 END OF TRANSMISSION \n", "... ... ... ... ... \n", "299377 1114105 \n", "299378 1114106 \n", "299379 1114107 \n", "299380 1114108 \n", "299381 1114109 \n", "\n", " decomposition_type punctuation symbol combining letter uppercase \\\n", "0 NaN False False False False False \n", "1 NaN False False False False False \n", "2 NaN False False False False False \n", "3 NaN False False False False False \n", "4 NaN False False False False False \n", "... ... ... ... ... ... ... \n", "299377 NaN False False False False False \n", "299378 NaN False False False False False \n", "299379 NaN False False False False False \n", "299380 NaN False False False False False \n", "299381 NaN False False False False False \n", "\n", " ... lowercase_map titlecase_map whitespace printable decimal \\\n", "0 ... 0 0 False False False \n", "1 ... 0 0 False False False \n", "2 ... 0 0 False False False \n", "3 ... 0 0 False False False \n", "4 ... 0 0 False False False \n", "... ... ... ... ... ... ... \n", "299377 ... 0 0 False False False \n", "299378 ... 0 0 False False False \n", "299379 ... 0 0 False False False \n", "299380 ... 0 0 False False False \n", "299381 ... 0 0 False False False \n", "\n", " digit number_like decimal_val digit_val number_like_val \n", "0 False False 0 0 NaN \n", "1 False False 0 0 NaN \n", "2 False False 0 0 NaN \n", "3 False False 0 0 NaN \n", "4 False False 0 0 NaN \n", "... ... ... ... ... ... \n", "299377 False False 0 0 NaN \n", "299378 False False 0 0 NaN \n", "299379 False False 0 0 NaN \n", "299380 False False 0 0 NaN \n", "299381 False False 0 0 NaN \n", "\n", "[299382 rows x 23 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now, for each contruct of the configuration, parse the file\n", "# vvv UNICODE vvv #\n", "unicode = UnicodeDataParser()\n", "with open(f'{DATA_DIR}/unicode.bin', 'r') as file:\n", " for line in file:\n", " unicode.parse_line(line)\n", "\n", "df = unicode.get_dataframe()\n", "df\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "2029f383", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codepointnameold_namedecompositiondecomposition_typepunctuationsymbolcombiningletteruppercase...lowercase_maptitlecase_mapwhitespaceprintabledecimaldigitnumber_likedecimal_valdigit_valnumber_like_val
4848DIGIT ZERONaNFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue000
4949DIGIT ONENaNFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue111
5050DIGIT TWONaNFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue222
5151DIGIT THREENaNFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue333
5252DIGIT FOURNaNFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue444
..................................................................
93029130037SEGMENTED DIGIT FIVE<font> 0035fontFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue555
93030130038SEGMENTED DIGIT SIX<font> 0036fontFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue666
93031130039SEGMENTED DIGIT SEVEN<font> 0037fontFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue777
93032130040SEGMENTED DIGIT EIGHT<font> 0038fontFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue888
93033130041SEGMENTED DIGIT NINE<font> 0039fontFalseFalseFalseFalseFalse...00FalseTrueTrueTrueTrue999
\n", "

770 rows × 23 columns

\n", "
" ], "text/plain": [ " codepoint name old_name decomposition \\\n", "48 48 DIGIT ZERO \n", "49 49 DIGIT ONE \n", "50 50 DIGIT TWO \n", "51 51 DIGIT THREE \n", "52 52 DIGIT FOUR \n", "... ... ... ... ... \n", "93029 130037 SEGMENTED DIGIT FIVE 0035 \n", "93030 130038 SEGMENTED DIGIT SIX 0036 \n", "93031 130039 SEGMENTED DIGIT SEVEN 0037 \n", "93032 130040 SEGMENTED DIGIT EIGHT 0038 \n", "93033 130041 SEGMENTED DIGIT NINE 0039 \n", "\n", " decomposition_type punctuation symbol combining letter uppercase \\\n", "48 NaN False False False False False \n", "49 NaN False False False False False \n", "50 NaN False False False False False \n", "51 NaN False False False False False \n", "52 NaN False False False False False \n", "... ... ... ... ... ... ... \n", "93029 font False False False False False \n", "93030 font False False False False False \n", "93031 font False False False False False \n", "93032 font False False False False False \n", "93033 font False False False False False \n", "\n", " ... lowercase_map titlecase_map whitespace printable decimal \\\n", "48 ... 0 0 False True True \n", "49 ... 0 0 False True True \n", "50 ... 0 0 False True True \n", "51 ... 0 0 False True True \n", "52 ... 0 0 False True True \n", "... ... ... ... ... ... ... \n", "93029 ... 0 0 False True True \n", "93030 ... 0 0 False True True \n", "93031 ... 0 0 False True True \n", "93032 ... 0 0 False True True \n", "93033 ... 0 0 False True True \n", "\n", " digit number_like decimal_val digit_val number_like_val \n", "48 True True 0 0 0 \n", "49 True True 1 1 1 \n", "50 True True 2 2 2 \n", "51 True True 3 3 3 \n", "52 True True 4 4 4 \n", "... ... ... ... ... ... \n", "93029 True True 5 5 5 \n", "93030 True True 6 6 6 \n", "93031 True True 7 7 7 \n", "93032 True True 8 8 8 \n", "93033 True True 9 9 9 \n", "\n", "[770 rows x 23 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# TEST #\n", "df[df['decimal'] == True]" ] }, { "cell_type": "code", "execution_count": 18, "id": "d29860e8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cp_startcp_endpunctuationletterwhitespacedecimal
0913FalseFalseTrueFalse
13232FalseFalseTrueFalse
23335TrueFalseFalseFalse
33742TrueFalseFalseFalse
44447TrueFalseFalseFalse
.....................
958183984191456FalseTrueFalseFalse
959191472192093FalseTrueFalseFalse
960194560195101FalseTrueFalseFalse
961196608201546FalseTrueFalseFalse
962201552210041FalseTrueFalseFalse
\n", "

963 rows × 6 columns

\n", "
" ], "text/plain": [ " cp_start cp_end punctuation letter whitespace decimal\n", "0 9 13 False False True False\n", "1 32 32 False False True False\n", "2 33 35 True False False False\n", "3 37 42 True False False False\n", "4 44 47 True False False False\n", ".. ... ... ... ... ... ...\n", "958 183984 191456 False True False False\n", "959 191472 192093 False True False False\n", "960 194560 195101 False True False False\n", "961 196608 201546 False True False False\n", "962 201552 210041 False True False False\n", "\n", "[963 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cp_startcp_enduppercase_maplowercase_mapdecimal_valrepeat
049490010
1110017
2833032-90
33232-32-3200
48459775000
.....................
15093434-34-3400
1510471434010
1511110017
15124760476000-80
1513110017
\n", "

1514 rows × 6 columns

\n", "
" ], "text/plain": [ " cp_start cp_end uppercase_map lowercase_map decimal_val repeat\n", "0 49 49 0 0 1 0\n", "1 1 1 0 0 1 7\n", "2 8 33 0 32 -9 0\n", "3 32 32 -32 -32 0 0\n", "4 84 59 775 0 0 0\n", "... ... ... ... ... ... ...\n", "1509 34 34 -34 -34 0 0\n", "1510 47 14 34 0 1 0\n", "1511 1 1 0 0 1 7\n", "1512 4760 4760 0 0 -8 0\n", "1513 1 1 0 0 1 7\n", "\n", "[1514 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codepointname
00NULL
11START OF HEADING
22START OF TEXT
33END OF TEXT
44END OF TRANSMISSION
.........
2993771114105<Plane 16 Private Use>
2993781114106<Plane 16 Private Use>
2993791114107<Plane 16 Private Use>
2993801114108<Plane 16 Private Use>
2993811114109<Plane 16 Private Use>
\n", "

299382 rows × 2 columns

\n", "
" ], "text/plain": [ " codepoint name\n", "0 0 NULL\n", "1 1 START OF HEADING\n", "2 2 START OF TEXT\n", "3 3 END OF TEXT\n", "4 4 END OF TRANSMISSION\n", "... ... ...\n", "299377 1114105 \n", "299378 1114106 \n", "299379 1114107 \n", "299380 1114108 \n", "299381 1114109 \n", "\n", "[299382 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# UNICODE >> Apply config\n", "u_cfg = CONFIG['unicode']\n", "\n", "# If useOldName is True, replace or empty names with old_name\n", "if u_cfg.get('useOldName'):\n", " mask = (df['name'].str.startswith('<')) & (df['old_name'].str.len() > 0)\n", " df.loc[mask, 'name'] = df.loc[mask, 'old_name']\n", "\n", "# Map CONFIG keys to actual DataFrame column names\n", "mapping = {\n", " 'getName': 'name',\n", " 'getDecomposition': 'decomposition',\n", " 'getDecompositionType': 'decomposition_type',\n", " 'isPunctuation': 'punctuation',\n", " 'isSymbol': 'symbol',\n", " 'isCombining': 'combining',\n", " 'isLetter': 'letter',\n", " 'isUppercase': 'uppercase',\n", " 'isLowercase': 'lowercase',\n", " 'isTitlecase': 'titlecase',\n", " 'toUppercase': 'uppercase_map',\n", " 'toLowercase': 'lowercase_map',\n", " 'toTitlecase': 'titlecase_map',\n", " 'isWhitespace': 'whitespace',\n", " 'isPrintable': 'printable',\n", " 'isDecimal': 'decimal',\n", " 'isDigit': 'digit',\n", " 'isNumberLike': 'number_like',\n", " 'getDecimal': 'decimal_val',\n", " 'getDigit': 'digit_val',\n", " 'getNumberLike': 'number_like_val'\n", "}\n", "\n", "# Categorize these keys\n", "str_props = ['name','decomposition']\n", "bool_props = [ y for x, y in mapping.items() if x.startswith('is') ]\n", "int_props = [\n", " 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map',\n", " 'decimal_val', 'digit_val', 'number_like_val'\n", "]\n", "\n", "keep_cols = ['codepoint'] # Always keep codepoint\n", "keep_cols += [ col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key) ]\n", "\n", "# Filter the dataframe\n", "df_keep = df[keep_cols]\n", "\n", "# Place all string properties in another df\n", "df_str = df_keep[ ['codepoint'] + [x for x in str_props if x in keep_cols] ]\n", "df_int = df_keep[ ['codepoint'] + [x for x in int_props if x in keep_cols] ]\n", "df_bool = df_keep[ ['codepoint'] + [x for x in bool_props if x in keep_cols] ]\n", "\n", "# Convert boolean-like columns to actual bools or int8 to save space\n", "for k in bool_props:\n", " if k in df_bool.columns:\n", " # Convert to bool...\n", " df_bool[k] = df_bool[k].fillna(False).astype(bool)\n", "\n", "\n", "# Remove all rows which are completely false (boolean rows)\n", "feature_cols = [col for col in df_bool.columns if col != 'codepoint']\n", "df_bool = df_bool[(~(df_bool[feature_cols].astype(bool) == False).all(axis=1))]\n", "\n", "feature_cols = [col for col in df_int.columns if col != 'codepoint']\n", "df_int = df_int[(~(df_int[feature_cols].astype(int) == 0).all(axis=1))]\n", "\n", "feature_cols = [col for col in df_str.columns if col != 'codepoint']\n", "df_str = df_str[(~(df_str[feature_cols] == \"\").all(axis=1))]\n", "\n", "# cleanup all 3\n", "df_bool = df_bool.sort_values('codepoint').reset_index(drop=True)\n", "df_int = df_int .sort_values('codepoint').reset_index(drop=True)\n", "df_str = df_str .sort_values('codepoint').reset_index(drop=True)\n", "\n", "# [DELTA ENCODING]\n", "def deltaEncode(df:pd.DataFrame, cols:list[str]):\n", " changed = (df[cols] != df[cols].shift()).any(axis=1)\n", " # Identify where the codepoint sequence breaks (gap > 1)\n", " sequence_broken = (df['codepoint'].diff() > 1)\n", " # A new range starts if metadata changed OR the sequence broke\n", " range_id = (changed | sequence_broken).cumsum()\n", " # 4. Group by the range_id and aggregate\n", " df = df.groupby(range_id).agg(\n", " cp_start=('codepoint', 'min'),\n", " cp_end=('codepoint', 'max'),\n", " **{col: (col, 'first') for col in cols}\n", " )\n", " df = df.reset_index(drop=True)\n", " return df\n", "\n", "# [RUN LENGTH ENCODING]\n", "def runLengthEncoding(df):\n", " deltas = df.diff().fillna(df).astype(int)\n", " change_mask = (deltas != deltas.shift()).any(axis=1)\n", " group_ids = change_mask.cumsum()\n", " encoded_df = deltas.groupby(group_ids).first()\n", " encoded_df['repeat'] = deltas.groupby(group_ids).size() - 1\n", " return encoded_df.reset_index(drop=True)\n", "\n", "# [BOOLEAN OPTIMIZATION]\n", "df_bool = deltaEncode(df_bool, [c for c in df_bool.columns if c != 'codepoint'])\n", "\n", "# [INT OPTIMIZATION]\n", "for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n", " if k not in df_int.columns: continue # vvv make mapping relative vvv\n", " df_int[k] = df_int.apply(lambda x : x[k] - x['codepoint'] if x[k] != 0 else 0, axis=1)\n", "\n", "df_int = deltaEncode(df_int, [c for c in df_int.columns if c != 'codepoint'])\n", "df_int = runLengthEncoding(df_int)\n", "\n", "display(df_bool)\n", "display(df_int)\n", "display(df_str)" ] }, { "cell_type": "code", "execution_count": 25, "id": "3959426e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Size of boolean dataframe: 8667'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Size of int dataframe: 36336'" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# TEST BINARY SIZE #\n", "codepoint_size = 4\n", "flag_size = math.ceil(1./8. * (len(df_bool.columns) - 2))\n", "\n", "display(f\"Size of boolean dataframe: {(2 * codepoint_size + flag_size) * len(df_bool)}\")\n", "display(f\"Size of int dataframe: {(len(df_int.columns) * codepoint_size) * len(df_int)}\")\n", "# TODO: size of string dataframe" ] }, { "cell_type": "code", "execution_count": null, "id": "a97b59a9", "metadata": {}, "outputs": [], "source": [ "# EXPORT UNICODE #\n", "OUT_DIR = './out'\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e75c5b2f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1d0ba29e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.3" } }, "nbformat": 4, "nbformat_minor": 5 }