Files
x-icu/py-gen.ipynb
2026-02-27 22:34:15 -06:00

1582 lines
55 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "4502a404",
"metadata": {},
"source": [
"# x-icu\n",
"> **The lightweight bridge to ICU data.**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2baaf74f",
"metadata": {},
"outputs": [],
"source": [
"# Add libraries...\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import urllib.request\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "41361ca4",
"metadata": {},
"outputs": [],
"source": [
"# Configure what to get\n",
"CONFIG = {\n",
"    # Raw data files to download: key -> URL\n",
"    'sources': {\n",
"        'unicode': 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',\n",
"    },\n",
"    # Per-codepoint properties taken from UnicodeData.txt (truthy = keep the column)\n",
"    'unicode': {\n",
"        # Unicode 1.0 (\"old\") name.\n",
"        # Set to True to use the old name whenever the current name is only\n",
"        # a placeholder, e.g. use \"START OF TEXT\" instead of \"<control>\".\n",
"        'useOldName': True,\n",
"        'getName': True,  # Formal name of the codepoint\n",
"\n",
"        # PROBABLY REMOVE # 'getCategory':False,\n",
"        # TODO # 'getScript': True, # e.g., \"Cyrillic\"\n",
"        # TODO # 'getBlock': True, # e.g., \"Mathematical Alphanumeric Symbols\"\n",
"\n",
"        'getDecomposition': False,  # aka \"ascii-fy\"\n",
"        'getDecompositionType': False,  # metadata about decomposition\n",
"\n",
"        'toLowercase': True,\n",
"        'toUppercase': True,\n",
"        'toTitlecase': False,\n",
"\n",
"        # TODO: 'isEmoji': True, # Does it have an emoji presentation?\n",
"        'isPunctuation': True,  # General Category starts with 'P'\n",
"        'isSymbol': False,  # General Category starts with 'S' (Math, Currency)\n",
"        'isCombining': False,  # Is it a mark/accent that needs a base letter?\n",
"\n",
"        'isPrintable': False,\n",
"        # NOTE(review): 'isSpace' has no entry in the config-to-column mapping\n",
"        # of the apply-config cell, so it appears to have no effect -- confirm.\n",
"        'isSpace': True,\n",
"        'isWhitespace': True,\n",
"        'isLetter': True,\n",
"        'isUppercase': False,\n",
"        'isLowercase': False,\n",
"        'isTitlecase': False,\n",
"        'isDeprecated': False,\n",
"\n",
"        # NOTE: The following fields cascade: every decimal is also reported\n",
"        # as a digit, and every digit is also reported as number-like.\n",
"        'isDecimal': True,  # An actual number 0-9, in different languages\n",
"        'isDigit': False,  # Digits not used in standard positional notation,\n",
"                           # like superscripts or circled numbers.\n",
"                           # Usually used by search engines.\n",
"        'isNumberLike': False,  # Neither of the previous two, but number like in nature, like 3/4\n",
"    }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94696c64",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Decodes the UnicodeData.txt data\n",
"class UnicodeDataParser:\n",
"    \"\"\"Streaming parser that turns UnicodeData.txt lines into property rows.\n",
"\n",
"    Call parse_line() once per line of the file, then get_dataframe() to\n",
"    collect one row per codepoint. Ranges that the file abbreviates as a\n",
"    \"<..., First>\" / \"<..., Last>\" pair are expanded to every codepoint\n",
"    in between.\n",
"    \"\"\"\n",
"\n",
"    # Codepoints outside general category Zs that still carry the Unicode\n",
"    # White_Space property: TAB, LF, VT, FF, CR, NEXT LINE (U+0085),\n",
"    # LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).\n",
"    _EXTRA_WHITESPACE = frozenset('\\t\\n\\v\\f\\r\\x85\\u2028\\u2029')\n",
"\n",
"    def __init__(self):\n",
"        # Normalized rows accumulated since the last get_dataframe() call\n",
"        self.buffer = []\n",
"        # Raw row of a pending \"<..., First>\" line, or None outside a range\n",
"        self.range_start_row = None\n",
"        # Names given to the 15 semicolon-separated fields, in file order\n",
"        self.keys = [\n",
"            \"codepoint\", \"name\", \"category\", \"combining_class\", \"bidi_class\",\n",
"            \"decomposition\", \"decimal_val\", \"digit_val\", \"numeric_val\",\n",
"            \"bidi_mirrored\", \"unicode_1_name\", \"iso_comment\",\n",
"            \"uppercase_map\", \"lowercase_map\", \"titlecase_map\"\n",
"        ]\n",
"\n",
"    def parse_line(self, text_line: str):\n",
"        \"\"\"Parse one line of UnicodeData.txt and buffer the resulting row(s).\n",
"\n",
"        Blank lines and lines with fewer than 15 fields are ignored.\n",
"        \"\"\"\n",
"        stripped = text_line.strip()\n",
"        if not stripped:\n",
"            return\n",
"\n",
"        parts = stripped.split(';')\n",
"        # Ensure we have 15 columns as per spec\n",
"        if len(parts) < 15:\n",
"            return\n",
"\n",
"        row = dict(zip(self.keys, parts))\n",
"        row['codepoint'] = int(row['codepoint'], 16)\n",
"        name = row['name']\n",
"\n",
"        # Detect Range Start: e.g., \"<CJK Ideograph, First>\"\n",
"        if name.endswith(', First>'):\n",
"            self.range_start_row = row\n",
"\n",
"        # Detect Range End: e.g., \"<CJK Ideograph, Last>\"\n",
"        elif name.endswith(', Last>') and self.range_start_row:\n",
"            self._fill_range(row)\n",
"            self.range_start_row = None\n",
"\n",
"        # Standard single codepoint\n",
"        else:\n",
"            self.buffer.append(self.normalize(row))\n",
"\n",
"    def _fill_range(self, end_row):\n",
"        \"\"\"Buffer one row per codepoint from the pending First row through end_row.\n",
"\n",
"        Every generated row copies the First row's fields; only the codepoint\n",
"        changes, and the name becomes the generic \"<Range Name>\".\n",
"        \"\"\"\n",
"        first_cp = self.range_start_row['codepoint']\n",
"        last_cp = end_row['codepoint']\n",
"\n",
"        # Generic name for the range (stripping the \", First>\" part)\n",
"        base_name = self.range_start_row['name'].replace(', First>', '').replace('<', '')\n",
"\n",
"        # Both ends of the range are inclusive\n",
"        for cp in range(first_cp, last_cp + 1):\n",
"            new_row = self.range_start_row.copy()\n",
"            new_row['codepoint'] = cp\n",
"            new_row['name'] = f\"<{base_name}>\"\n",
"            self.buffer.append(self.normalize(new_row))\n",
"\n",
"    def normalize(self, in_row:dict):\n",
"        \"\"\"Convert one raw UnicodeData row into the flat row kept in the buffer.\n",
"\n",
"        Args:\n",
"            in_row: dict keyed by self.keys, with 'codepoint' already an int.\n",
"\n",
"        Returns:\n",
"            dict holding the codepoint, both names, the decomposition and its\n",
"            tag, boolean category flags, the three case mappings (target\n",
"            codepoint, 0 when the field is empty) and the cascading\n",
"            decimal / digit / number_like flags.\n",
"        \"\"\"\n",
"        out_row = {}\n",
"\n",
"        # Parse codepoint\n",
"        out_row['codepoint'] = in_row['codepoint']\n",
"        char = chr(out_row['codepoint'])\n",
"\n",
"        out_row['name'] = in_row['name']\n",
"        out_row['old_name'] = in_row['unicode_1_name']\n",
"\n",
"        out_row['decomposition'] = in_row['decomposition']\n",
"\n",
"        # Extracts text inside <> like <compat> or <circle>\n",
"        decomp = in_row['decomposition']\n",
"        out_row['decomposition_type'] = decomp[1:decomp.find('>')] if '<' in decomp else None\n",
"\n",
"        category = in_row['category']\n",
"        out_row['punctuation'] = category.startswith('P')\n",
"        out_row['symbol'] = category.startswith('S')\n",
"        out_row['combining'] = category.startswith('M')  # Mark category\n",
"        out_row['letter'] = category.startswith('L')\n",
"\n",
"        out_row['uppercase'] = category == 'Lu'\n",
"        out_row['lowercase'] = category == 'Ll'\n",
"        out_row['titlecase'] = category == 'Lt'\n",
"\n",
"        # Mappings: hexadecimal target codepoint, or 0 when the field is empty\n",
"        for k in [\"uppercase_map\", \"lowercase_map\", \"titlecase_map\"]:\n",
"            out_row[k] = int(in_row[k], 16) if in_row[k] != '' else 0\n",
"\n",
"        # Unicode White_Space = category Zs (Space Separator) plus the few\n",
"        # control / line / paragraph separator codepoints listed above\n",
"        out_row['whitespace'] = category == 'Zs' or char in self._EXTRA_WHITESPACE\n",
"\n",
"        # Every 'C*' category (control, format, surrogate, private use) is\n",
"        # treated as non-printable; separators are still reported as printable\n",
"        out_row['printable'] = not category.startswith('C')\n",
"\n",
"        # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)\n",
"        is_decimal = bool(in_row['decimal_val'])\n",
"        is_digit = is_decimal or bool(in_row['digit_val'])\n",
"        is_numeric = is_digit or bool(in_row['numeric_val'])\n",
"\n",
"        out_row['decimal'] = is_decimal\n",
"        out_row['digit'] = is_digit\n",
"        out_row['number_like'] = is_numeric\n",
"\n",
"        # return the data\n",
"        return out_row\n",
"\n",
"    def get_dataframe(self):\n",
"        \"\"\"Return the buffered rows as a DataFrame and empty the buffer.\"\"\"\n",
"        df = pd.DataFrame(self.buffer)\n",
"        self.buffer.clear()\n",
"        return df\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cfb55a58",
"metadata": {},
"outputs": [],
"source": [
"# Download the data\n",
"DATA_DIR = './data'\n",
"os.makedirs(DATA_DIR, exist_ok=True)\n",
"\n",
"def downloadFile(key:str, url:str):\n",
"    \"\"\"Download `url` into DATA_DIR as '<key>.bin'.\n",
"\n",
"    The payload is written to a temporary '.part' file and renamed only after\n",
"    the transfer has finished, so an interrupted download never leaves a\n",
"    truncated '<key>.bin' behind for later cells to parse.\n",
"    \"\"\"\n",
"    target = f'{DATA_DIR}/{key}.bin'\n",
"    partial = f'{target}.part'\n",
"    urllib.request.urlretrieve(url, partial)\n",
"    # os.replace overwrites an existing target file\n",
"    os.replace(partial, target)\n",
"\n",
"# Fetch every configured source\n",
"for k, url in CONFIG['sources'].items():\n",
"    downloadFile(k, url)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7dac9d1f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>codepoint</th>\n",
" <th>name</th>\n",
" <th>old_name</th>\n",
" <th>decomposition</th>\n",
" <th>decomposition_type</th>\n",
" <th>punctuation</th>\n",
" <th>symbol</th>\n",
" <th>combining</th>\n",
" <th>letter</th>\n",
" <th>uppercase</th>\n",
" <th>lowercase</th>\n",
" <th>titlecase</th>\n",
" <th>uppercase_map</th>\n",
" <th>lowercase_map</th>\n",
" <th>titlecase_map</th>\n",
" <th>whitespace</th>\n",
" <th>printable</th>\n",
" <th>decimal</th>\n",
" <th>digit</th>\n",
" <th>number_like</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>&lt;control&gt;</td>\n",
" <td>NULL</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>&lt;control&gt;</td>\n",
" <td>START OF HEADING</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>&lt;control&gt;</td>\n",
" <td>START OF TEXT</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>&lt;control&gt;</td>\n",
" <td>END OF TEXT</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>&lt;control&gt;</td>\n",
" <td>END OF TRANSMISSION</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299377</th>\n",
" <td>1114105</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299378</th>\n",
" <td>1114106</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299379</th>\n",
" <td>1114107</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299380</th>\n",
" <td>1114108</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299381</th>\n",
" <td>1114109</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>299382 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" codepoint name old_name decomposition \\\n",
"0 0 <control> NULL \n",
"1 1 <control> START OF HEADING \n",
"2 2 <control> START OF TEXT \n",
"3 3 <control> END OF TEXT \n",
"4 4 <control> END OF TRANSMISSION \n",
"... ... ... ... ... \n",
"299377 1114105 <Plane 16 Private Use> \n",
"299378 1114106 <Plane 16 Private Use> \n",
"299379 1114107 <Plane 16 Private Use> \n",
"299380 1114108 <Plane 16 Private Use> \n",
"299381 1114109 <Plane 16 Private Use> \n",
"\n",
" decomposition_type punctuation symbol combining letter uppercase \\\n",
"0 NaN False False False False False \n",
"1 NaN False False False False False \n",
"2 NaN False False False False False \n",
"3 NaN False False False False False \n",
"4 NaN False False False False False \n",
"... ... ... ... ... ... ... \n",
"299377 NaN False False False False False \n",
"299378 NaN False False False False False \n",
"299379 NaN False False False False False \n",
"299380 NaN False False False False False \n",
"299381 NaN False False False False False \n",
"\n",
" lowercase titlecase uppercase_map lowercase_map titlecase_map \\\n",
"0 False False 0 0 0 \n",
"1 False False 0 0 0 \n",
"2 False False 0 0 0 \n",
"3 False False 0 0 0 \n",
"4 False False 0 0 0 \n",
"... ... ... ... ... ... \n",
"299377 False False 0 0 0 \n",
"299378 False False 0 0 0 \n",
"299379 False False 0 0 0 \n",
"299380 False False 0 0 0 \n",
"299381 False False 0 0 0 \n",
"\n",
" whitespace printable decimal digit number_like \n",
"0 False False False False False \n",
"1 False False False False False \n",
"2 False False False False False \n",
"3 False False False False False \n",
"4 False False False False False \n",
"... ... ... ... ... ... \n",
"299377 False False False False False \n",
"299378 False False False False False \n",
"299379 False False False False False \n",
"299380 False False False False False \n",
"299381 False False False False False \n",
"\n",
"[299382 rows x 20 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now, for each construct of the configuration, parse the file\n",
"# vvv UNICODE vvv #\n",
"unicode = UnicodeDataParser()\n",
"# Explicit encoding so the result does not depend on the platform default\n",
"with open(f'{DATA_DIR}/unicode.bin', 'r', encoding='utf-8') as file:\n",
"    for line in file:\n",
"        unicode.parse_line(line)\n",
"\n",
"# One row per codepoint, ranges already expanded\n",
"df = unicode.get_dataframe()\n",
"df\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2029f383",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>codepoint</th>\n",
" <th>name</th>\n",
" <th>old_name</th>\n",
" <th>decomposition</th>\n",
" <th>decomposition_type</th>\n",
" <th>punctuation</th>\n",
" <th>symbol</th>\n",
" <th>combining</th>\n",
" <th>letter</th>\n",
" <th>uppercase</th>\n",
" <th>lowercase</th>\n",
" <th>titlecase</th>\n",
" <th>uppercase_map</th>\n",
" <th>lowercase_map</th>\n",
" <th>titlecase_map</th>\n",
" <th>whitespace</th>\n",
" <th>printable</th>\n",
" <th>decimal</th>\n",
" <th>digit</th>\n",
" <th>number_like</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>160</td>\n",
" <td>NO-BREAK SPACE</td>\n",
" <td>NON-BREAKING SPACE</td>\n",
" <td>&lt;noBreak&gt; 0020</td>\n",
" <td>noBreak</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>168</th>\n",
" <td>168</td>\n",
" <td>DIAERESIS</td>\n",
" <td>SPACING DIAERESIS</td>\n",
" <td>&lt;compat&gt; 0020 0308</td>\n",
" <td>compat</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170</th>\n",
" <td>170</td>\n",
" <td>FEMININE ORDINAL INDICATOR</td>\n",
" <td></td>\n",
" <td>&lt;super&gt; 0061</td>\n",
" <td>super</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>175</td>\n",
" <td>MACRON</td>\n",
" <td>SPACING MACRON</td>\n",
" <td>&lt;compat&gt; 0020 0304</td>\n",
" <td>compat</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178</th>\n",
" <td>178</td>\n",
" <td>SUPERSCRIPT TWO</td>\n",
" <td>SUPERSCRIPT DIGIT TWO</td>\n",
" <td>&lt;super&gt; 0032</td>\n",
" <td>super</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93029</th>\n",
" <td>130037</td>\n",
" <td>SEGMENTED DIGIT FIVE</td>\n",
" <td></td>\n",
" <td>&lt;font&gt; 0035</td>\n",
" <td>font</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93030</th>\n",
" <td>130038</td>\n",
" <td>SEGMENTED DIGIT SIX</td>\n",
" <td></td>\n",
" <td>&lt;font&gt; 0036</td>\n",
" <td>font</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93031</th>\n",
" <td>130039</td>\n",
" <td>SEGMENTED DIGIT SEVEN</td>\n",
" <td></td>\n",
" <td>&lt;font&gt; 0037</td>\n",
" <td>font</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93032</th>\n",
" <td>130040</td>\n",
" <td>SEGMENTED DIGIT EIGHT</td>\n",
" <td></td>\n",
" <td>&lt;font&gt; 0038</td>\n",
" <td>font</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93033</th>\n",
" <td>130041</td>\n",
" <td>SEGMENTED DIGIT NINE</td>\n",
" <td></td>\n",
" <td>&lt;font&gt; 0039</td>\n",
" <td>font</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3833 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" codepoint name old_name \\\n",
"160 160 NO-BREAK SPACE NON-BREAKING SPACE \n",
"168 168 DIAERESIS SPACING DIAERESIS \n",
"170 170 FEMININE ORDINAL INDICATOR \n",
"175 175 MACRON SPACING MACRON \n",
"178 178 SUPERSCRIPT TWO SUPERSCRIPT DIGIT TWO \n",
"... ... ... ... \n",
"93029 130037 SEGMENTED DIGIT FIVE \n",
"93030 130038 SEGMENTED DIGIT SIX \n",
"93031 130039 SEGMENTED DIGIT SEVEN \n",
"93032 130040 SEGMENTED DIGIT EIGHT \n",
"93033 130041 SEGMENTED DIGIT NINE \n",
"\n",
" decomposition decomposition_type punctuation symbol combining \\\n",
"160 <noBreak> 0020 noBreak False False False \n",
"168 <compat> 0020 0308 compat False True False \n",
"170 <super> 0061 super False False False \n",
"175 <compat> 0020 0304 compat False True False \n",
"178 <super> 0032 super False False False \n",
"... ... ... ... ... ... \n",
"93029 <font> 0035 font False False False \n",
"93030 <font> 0036 font False False False \n",
"93031 <font> 0037 font False False False \n",
"93032 <font> 0038 font False False False \n",
"93033 <font> 0039 font False False False \n",
"\n",
" letter uppercase lowercase titlecase uppercase_map lowercase_map \\\n",
"160 False False False False 0 0 \n",
"168 False False False False 0 0 \n",
"170 True False False False 0 0 \n",
"175 False False False False 0 0 \n",
"178 False False False False 0 0 \n",
"... ... ... ... ... ... ... \n",
"93029 False False False False 0 0 \n",
"93030 False False False False 0 0 \n",
"93031 False False False False 0 0 \n",
"93032 False False False False 0 0 \n",
"93033 False False False False 0 0 \n",
"\n",
" titlecase_map whitespace printable decimal digit number_like \n",
"160 0 True True False False False \n",
"168 0 False True False False False \n",
"170 0 False True False False False \n",
"175 0 False True False False False \n",
"178 0 False True False True True \n",
"... ... ... ... ... ... ... \n",
"93029 0 False True True True True \n",
"93030 0 False True True True True \n",
"93031 0 False True True True True \n",
"93032 0 False True True True True \n",
"93033 0 False True True True True \n",
"\n",
"[3833 rows x 20 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TEST: show only the rows that carry a decomposition tag #\n",
"df[df['decomposition_type'].notna()]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d29860e8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cp_start</th>\n",
" <th>cp_end</th>\n",
" <th>punctuation</th>\n",
" <th>letter</th>\n",
" <th>whitespace</th>\n",
" <th>decimal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>13</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>32</td>\n",
" <td>32</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>33</td>\n",
" <td>35</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>37</td>\n",
" <td>42</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>44</td>\n",
" <td>47</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>958</th>\n",
" <td>183984</td>\n",
" <td>191456</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>959</th>\n",
" <td>191472</td>\n",
" <td>192093</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>960</th>\n",
" <td>194560</td>\n",
" <td>195101</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>961</th>\n",
" <td>196608</td>\n",
" <td>201546</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>962</th>\n",
" <td>201552</td>\n",
" <td>210041</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>963 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" cp_start cp_end punctuation letter whitespace decimal\n",
"0 9 13 False False True False\n",
"1 32 32 False False True False\n",
"2 33 35 True False False False\n",
"3 37 42 True False False False\n",
"4 44 47 True False False False\n",
".. ... ... ... ... ... ...\n",
"958 183984 191456 False True False False\n",
"959 191472 192093 False True False False\n",
"960 194560 195101 False True False False\n",
"961 196608 201546 False True False False\n",
"962 201552 210041 False True False False\n",
"\n",
"[963 rows x 6 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>codepoint</th>\n",
" <th>uppercase_map</th>\n",
" <th>lowercase_map</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>65</td>\n",
" <td>0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>66</td>\n",
" <td>0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>67</td>\n",
" <td>0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>68</td>\n",
" <td>0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>69</td>\n",
" <td>0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2984</th>\n",
" <td>125247</td>\n",
" <td>-34</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2985</th>\n",
" <td>125248</td>\n",
" <td>-34</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2986</th>\n",
" <td>125249</td>\n",
" <td>-34</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2987</th>\n",
" <td>125250</td>\n",
" <td>-34</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2988</th>\n",
" <td>125251</td>\n",
" <td>-34</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2989 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" codepoint uppercase_map lowercase_map\n",
"0 65 0 32\n",
"1 66 0 32\n",
"2 67 0 32\n",
"3 68 0 32\n",
"4 69 0 32\n",
"... ... ... ...\n",
"2984 125247 -34 0\n",
"2985 125248 -34 0\n",
"2986 125249 -34 0\n",
"2987 125250 -34 0\n",
"2988 125251 -34 0\n",
"\n",
"[2989 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>codepoint</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>NULL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>START OF HEADING</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>START OF TEXT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>END OF TEXT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>END OF TRANSMISSION</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299377</th>\n",
" <td>1114105</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299378</th>\n",
" <td>1114106</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299379</th>\n",
" <td>1114107</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299380</th>\n",
" <td>1114108</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>299381</th>\n",
" <td>1114109</td>\n",
" <td>&lt;Plane 16 Private Use&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>299382 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" codepoint name\n",
"0 0 NULL\n",
"1 1 START OF HEADING\n",
"2 2 START OF TEXT\n",
"3 3 END OF TEXT\n",
"4 4 END OF TRANSMISSION\n",
"... ... ...\n",
"299377 1114105 <Plane 16 Private Use>\n",
"299378 1114106 <Plane 16 Private Use>\n",
"299379 1114107 <Plane 16 Private Use>\n",
"299380 1114108 <Plane 16 Private Use>\n",
"299381 1114109 <Plane 16 Private Use>\n",
"\n",
"[299382 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# UNICODE >> Apply config\n",
"u_cfg = CONFIG['unicode']\n",
"\n",
"# If useOldName is truthy, replace placeholder names (those starting with '<', e.g. <control>) with old_name when one exists\n",
"if u_cfg.get('useOldName'):\n",
" mask = (df['name'].str.startswith('<')) & (df['old_name'].str.len() > 0)\n",
" df.loc[mask, 'name'] = df.loc[mask, 'old_name']\n",
"\n",
"# Map CONFIG keys to actual DataFrame column names\n",
"mapping = {\n",
" 'getName': 'name',\n",
" 'getDecomposition': 'decomposition',\n",
" 'getDecompositionType': 'decomposition_type',\n",
" 'isPunctuation': 'punctuation',\n",
" 'isSymbol': 'symbol',\n",
" 'isCombining': 'combining',\n",
" 'isLetter': 'letter',\n",
" 'isUppercase': 'uppercase',\n",
" 'isLowercase': 'lowercase',\n",
" 'isTitlecase': 'titlecase',\n",
" 'toUppercase': 'uppercase_map',\n",
" 'toLowercase': 'lowercase_map',\n",
" 'toTitlecase': 'titlecase_map',\n",
" 'isWhitespace': 'whitespace',\n",
" 'isPrintable': 'printable',\n",
" 'isDecimal': 'decimal',\n",
" 'isDigit': 'digit',\n",
" 'isNumberLike': 'number_like'\n",
"}\n",
"\n",
"# Categorize these keys\n",
"str_props = ['name','decomposition']\n",
"bool_props = [ y for x, y in mapping.items() if x.startswith('is') ]\n",
"int_props = [ 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map' ]\n",
"\n",
"keep_cols = ['codepoint'] # Always keep codepoint\n",
"keep_cols += [ col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key) ]\n",
"\n",
"# Filter the dataframe\n",
"df_keep = df[keep_cols]\n",
"\n",
"# Place all string properties in another df\n",
"df_str = df_keep[ ['codepoint'] + [x for x in str_props if x in keep_cols] ]\n",
"df_int = df_keep[ ['codepoint'] + [x for x in int_props if x in keep_cols] ]\n",
"df_bool = df_keep[ ['codepoint'] + [x for x in bool_props if x in keep_cols] ]\n",
"\n",
"# Convert boolean-like columns to actual bools or int8 to save space\n",
"for k in bool_props:\n",
" if k in df_bool.columns:\n",
" # Convert to bool...\n",
" df_bool[k] = df_bool[k].fillna(False).astype(bool)\n",
"\n",
"\n",
"# Remove all rows which are completely false (boolean rows)\n",
"feature_cols = [col for col in df_bool.columns if col != 'codepoint']\n",
"df_bool = df_bool[(~(df_bool[feature_cols].astype(bool) == False).all(axis=1))]\n",
"\n",
"feature_cols = [col for col in df_int.columns if col != 'codepoint']\n",
"df_int = df_int[(~(df_int[feature_cols].astype(int) == 0).all(axis=1))]\n",
"\n",
"feature_cols = [col for col in df_str.columns if col != 'codepoint']\n",
"df_str = df_str[(~(df_str[feature_cols] == \"\").all(axis=1))]\n",
"\n",
"# cleanup all 3\n",
"df_bool = df_bool.sort_values('codepoint').reset_index(drop=True)\n",
"df_int = df_int .sort_values('codepoint').reset_index(drop=True)\n",
"df_str = df_str .sort_values('codepoint').reset_index(drop=True)\n",
"\n",
"# [BOOLEAN OPTIMIZATION]\n",
"# Identify where boolean changes (comparing current row to previous)\n",
"# We exclude the 'codepoint' from this comparison\n",
"metadata_cols = [c for c in df_bool.columns if c != 'codepoint']\n",
"metadata_changed = (df_bool[metadata_cols] != df_bool[metadata_cols].shift()).any(axis=1)\n",
"# Identify where the codepoint sequence breaks (gap > 1)\n",
"sequence_broken = (df_bool['codepoint'].diff() > 1)\n",
"# A new range starts if metadata changed OR the sequence broke\n",
"range_id = (metadata_changed | sequence_broken).cumsum()\n",
"# 4. Group by the range_id and aggregate\n",
"df_bool = df_bool.groupby(range_id).agg(\n",
" cp_start=('codepoint', 'min'),\n",
" cp_end=('codepoint', 'max'),\n",
" **{col: (col, 'first') for col in metadata_cols}\n",
")\n",
"df_bool = df_bool.reset_index(drop=True)\n",
"\n",
"# [INT OPTIMIZATION]\n",
"for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n",
" if k not in df_int.columns: continue\n",
" df_int[k] = df_int.apply(lambda x : x[k] - x['codepoint'] if x[k] != 0 else 0, axis=1)\n",
"\n",
"display(df_bool)\n",
"display(df_int)\n",
"display(df_str)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "73cf0f32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"False"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# TEST #\n",
"# Show, in order, whether the string / boolean / integer tables are empty\n",
"for frame in (df_str, df_bool, df_int):\n",
"    display(frame.empty)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0085c0",
"metadata": {},
"outputs": [],
"source": [
"# Format of binaries\n",
"# \"X-ICU\"\n",
"# <Same Byte Size:16 -- 0 if not same size>\n",
"# <Number of ranges>\n",
"# <cp0><cp1><byte size of block:8 -- not present if same size>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a97b59a9",
"metadata": {},
"outputs": [],
"source": [
"# EXPORT UNICODE #\n",
"# Work in progress: sets up the export parameters; the writer itself is not\n",
"# implemented yet.\n",
"OUT_DIR = './out'\n",
"str_cols = [ 'name', 'decomposition' ]\n",
"cp_size = 4\n",
"# Truthy when any of these holds: strings are zero-padded, strings go to a\n",
"# dedicated file, or `df` has none of the string columns.\n",
"# NOTE(review): this inspects the unfiltered `df`, not the kept columns -- confirm.\n",
"same_size = (\n",
"    CONFIG['general']['zeroPadStrings']\n",
"    or CONFIG['general']['exportStringsInDedicatedFile']\n",
"    or set(str_cols).isdisjoint(df.columns)\n",
")\n",
"\n",
"# Select the boolean ('is*') properties that are present in the data.\n",
"# TODO: determine each property's size and write it out.\n",
"for cfg_key, col_name in mapping.items():\n",
"    # DataFrame columns are named after the mapping values, so test col_name\n",
"    if col_name in df.columns and cfg_key.startswith('is'):\n",
"        pass  # TODO: export logic not written yet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e75c5b2f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d0ba29e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}