diff --git a/py-gen.ipynb b/py-gen.ipynb
index e55029a..f34c779 100644
--- a/py-gen.ipynb
+++ b/py-gen.ipynb
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 19,
"id": "2baaf74f",
"metadata": {},
"outputs": [],
@@ -20,12 +20,13 @@
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
- "import urllib.request\n"
+ "import urllib.request\n",
+ "import math"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 2,
"id": "41361ca4",
"metadata": {},
"outputs": [],
@@ -76,6 +77,11 @@
" # like superscripts or circled numbers.\n",
" # Usually used by search engines.\n",
" 'isNumberLike':False, # Neither of the previous two, but number like in nature, like 3/4\n",
+ "\n",
+ " # Getter flags: when enabled, export the numeric value the character represents\n",
+ " 'getDecimal': True,\n",
+ " 'getDigit': False,\n",
+ " 'getNumberLike': False,\n",
" }\n",
"}\n"
]
@@ -189,6 +195,10 @@
" out_row['digit'] = is_digit\n",
" out_row['number_like'] = is_numeric\n",
"\n",
+ " out_row['decimal_val'] = int(in_row['decimal_val']) if is_decimal else 0\n",
+ " out_row['digit_val'] = int(in_row['digit_val']) if is_digit else 0\n",
+ " out_row['number_like_val'] = in_row['numeric_val'] if is_numeric else None\n",
+ "\n",
" # return the data\n",
" return out_row\n",
"\n",
@@ -253,9 +263,7 @@
"
combining | \n",
" letter | \n",
" uppercase | \n",
- " lowercase | \n",
- " titlecase | \n",
- " uppercase_map | \n",
+ " ... | \n",
" lowercase_map | \n",
" titlecase_map | \n",
" whitespace | \n",
@@ -263,6 +271,9 @@
" decimal | \n",
" digit | \n",
" number_like | \n",
+ " decimal_val | \n",
+ " digit_val | \n",
+ " number_like_val | \n",
" \n",
" \n",
" \n",
@@ -278,16 +289,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
" \n",
" \n",
" | 1 | \n",
@@ -301,16 +313,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -324,16 +337,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -347,16 +361,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -370,16 +385,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | ... | \n",
@@ -403,6 +419,7 @@
" ... | \n",
" ... | \n",
" ... | \n",
+ " ... | \n",
"
\n",
" \n",
" | 299377 | \n",
@@ -416,16 +433,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 299378 | \n",
@@ -439,16 +457,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 299379 | \n",
@@ -462,16 +481,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 299380 | \n",
@@ -485,16 +505,17 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
" | 299381 | \n",
@@ -508,20 +529,21 @@
" False | \n",
" False | \n",
" False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
" False | \n",
" False | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " NaN | \n",
"
\n",
" \n",
"\n",
- "299382 rows × 20 columns
\n",
+ "299382 rows × 23 columns
\n",
""
],
"text/plain": [
@@ -551,33 +573,33 @@
"299380 NaN False False False False False \n",
"299381 NaN False False False False False \n",
"\n",
- " lowercase titlecase uppercase_map lowercase_map titlecase_map \\\n",
- "0 False False 0 0 0 \n",
- "1 False False 0 0 0 \n",
- "2 False False 0 0 0 \n",
- "3 False False 0 0 0 \n",
- "4 False False 0 0 0 \n",
- "... ... ... ... ... ... \n",
- "299377 False False 0 0 0 \n",
- "299378 False False 0 0 0 \n",
- "299379 False False 0 0 0 \n",
- "299380 False False 0 0 0 \n",
- "299381 False False 0 0 0 \n",
+ " ... lowercase_map titlecase_map whitespace printable decimal \\\n",
+ "0 ... 0 0 False False False \n",
+ "1 ... 0 0 False False False \n",
+ "2 ... 0 0 False False False \n",
+ "3 ... 0 0 False False False \n",
+ "4 ... 0 0 False False False \n",
+ "... ... ... ... ... ... ... \n",
+ "299377 ... 0 0 False False False \n",
+ "299378 ... 0 0 False False False \n",
+ "299379 ... 0 0 False False False \n",
+ "299380 ... 0 0 False False False \n",
+ "299381 ... 0 0 False False False \n",
"\n",
- " whitespace printable decimal digit number_like \n",
- "0 False False False False False \n",
- "1 False False False False False \n",
- "2 False False False False False \n",
- "3 False False False False False \n",
- "4 False False False False False \n",
- "... ... ... ... ... ... \n",
- "299377 False False False False False \n",
- "299378 False False False False False \n",
- "299379 False False False False False \n",
- "299380 False False False False False \n",
- "299381 False False False False False \n",
+ " digit number_like decimal_val digit_val number_like_val \n",
+ "0 False False 0 0 NaN \n",
+ "1 False False 0 0 NaN \n",
+ "2 False False 0 0 NaN \n",
+ "3 False False 0 0 NaN \n",
+ "4 False False 0 0 NaN \n",
+ "... ... ... ... ... ... \n",
+ "299377 False False 0 0 NaN \n",
+ "299378 False False 0 0 NaN \n",
+ "299379 False False 0 0 NaN \n",
+ "299380 False False 0 0 NaN \n",
+ "299381 False False 0 0 NaN \n",
"\n",
- "[299382 rows x 20 columns]"
+ "[299382 rows x 23 columns]"
]
},
"execution_count": 5,
@@ -634,9 +656,7 @@
" combining | \n",
" letter | \n",
" uppercase | \n",
- " lowercase | \n",
- " titlecase | \n",
- " uppercase_map | \n",
+ " ... | \n",
" lowercase_map | \n",
" titlecase_map | \n",
" whitespace | \n",
@@ -644,123 +664,131 @@
" decimal | \n",
" digit | \n",
" number_like | \n",
+ " decimal_val | \n",
+ " digit_val | \n",
+ " number_like_val | \n",
" \n",
" \n",
" \n",
" \n",
- " | 160 | \n",
- " 160 | \n",
- " NO-BREAK SPACE | \n",
- " NON-BREAKING SPACE | \n",
- " <noBreak> 0020 | \n",
- " noBreak | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 168 | \n",
- " 168 | \n",
- " DIAERESIS | \n",
- " SPACING DIAERESIS | \n",
- " <compat> 0020 0308 | \n",
- " compat | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " | 170 | \n",
- " 170 | \n",
- " FEMININE ORDINAL INDICATOR | \n",
+ " 48 | \n",
+ " 48 | \n",
+ " DIGIT ZERO | \n",
" | \n",
- " <super> 0061 | \n",
- " super | \n",
+ " | \n",
+ " NaN | \n",
" False | \n",
" False | \n",
" False | \n",
- " True | \n",
" False | \n",
" False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
" True | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
"
\n",
" \n",
- " | 175 | \n",
- " 175 | \n",
- " MACRON | \n",
- " SPACING MACRON | \n",
- " <compat> 0020 0304 | \n",
- " compat | \n",
- " False | \n",
- " True | \n",
+ " 49 | \n",
+ " 49 | \n",
+ " DIGIT ONE | \n",
+ " | \n",
+ " | \n",
+ " NaN | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
" True | \n",
- " False | \n",
- " False | \n",
- " False | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
"
\n",
" \n",
- " | 178 | \n",
- " 178 | \n",
- " SUPERSCRIPT TWO | \n",
- " SUPERSCRIPT DIGIT TWO | \n",
- " <super> 0032 | \n",
- " super | \n",
+ " 50 | \n",
+ " 50 | \n",
+ " DIGIT TWO | \n",
+ " | \n",
+ " | \n",
+ " NaN | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
" True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " 51 | \n",
+ " DIGIT THREE | \n",
+ " | \n",
+ " | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
" False | \n",
" True | \n",
" True | \n",
+ " True | \n",
+ " True | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " 52 | \n",
+ " DIGIT FOUR | \n",
+ " | \n",
+ " | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | ... | \n",
@@ -784,6 +812,7 @@
" ... | \n",
" ... | \n",
" ... | \n",
+ " ... | \n",
"
\n",
" \n",
" | 93029 | \n",
@@ -797,9 +826,7 @@
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
@@ -807,6 +834,9 @@
" True | \n",
" True | \n",
" True | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 5 | \n",
"
\n",
" \n",
" | 93030 | \n",
@@ -820,9 +850,7 @@
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
@@ -830,6 +858,9 @@
" True | \n",
" True | \n",
" True | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
"
\n",
" \n",
" | 93031 | \n",
@@ -843,9 +874,7 @@
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
@@ -853,6 +882,9 @@
" True | \n",
" True | \n",
" True | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " 7 | \n",
"
\n",
" \n",
" | 93032 | \n",
@@ -866,9 +898,7 @@
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
@@ -876,6 +906,9 @@
" True | \n",
" True | \n",
" True | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
"
\n",
" \n",
" | 93033 | \n",
@@ -889,9 +922,7 @@
" False | \n",
" False | \n",
" False | \n",
- " False | \n",
- " False | \n",
- " 0 | \n",
+ " ... | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
@@ -899,66 +930,69 @@
" True | \n",
" True | \n",
" True | \n",
+ " 9 | \n",
+ " 9 | \n",
+ " 9 | \n",
"
\n",
" \n",
"\n",
- "3833 rows × 20 columns
\n",
+ "770 rows × 23 columns
\n",
""
],
"text/plain": [
- " codepoint name old_name \\\n",
- "160 160 NO-BREAK SPACE NON-BREAKING SPACE \n",
- "168 168 DIAERESIS SPACING DIAERESIS \n",
- "170 170 FEMININE ORDINAL INDICATOR \n",
- "175 175 MACRON SPACING MACRON \n",
- "178 178 SUPERSCRIPT TWO SUPERSCRIPT DIGIT TWO \n",
- "... ... ... ... \n",
- "93029 130037 SEGMENTED DIGIT FIVE \n",
- "93030 130038 SEGMENTED DIGIT SIX \n",
- "93031 130039 SEGMENTED DIGIT SEVEN \n",
- "93032 130040 SEGMENTED DIGIT EIGHT \n",
- "93033 130041 SEGMENTED DIGIT NINE \n",
+ " codepoint name old_name decomposition \\\n",
+ "48 48 DIGIT ZERO \n",
+ "49 49 DIGIT ONE \n",
+ "50 50 DIGIT TWO \n",
+ "51 51 DIGIT THREE \n",
+ "52 52 DIGIT FOUR \n",
+ "... ... ... ... ... \n",
+ "93029 130037 SEGMENTED DIGIT FIVE 0035 \n",
+ "93030 130038 SEGMENTED DIGIT SIX 0036 \n",
+ "93031 130039 SEGMENTED DIGIT SEVEN 0037 \n",
+ "93032 130040 SEGMENTED DIGIT EIGHT 0038 \n",
+ "93033 130041 SEGMENTED DIGIT NINE 0039 \n",
"\n",
- " decomposition decomposition_type punctuation symbol combining \\\n",
- "160 0020 noBreak False False False \n",
- "168 0020 0308 compat False True False \n",
- "170 0061 super False False False \n",
- "175 0020 0304 compat False True False \n",
- "178 0032 super False False False \n",
- "... ... ... ... ... ... \n",
- "93029 0035 font False False False \n",
- "93030 0036 font False False False \n",
- "93031 0037 font False False False \n",
- "93032 0038 font False False False \n",
- "93033 0039 font False False False \n",
+ " decomposition_type punctuation symbol combining letter uppercase \\\n",
+ "48 NaN False False False False False \n",
+ "49 NaN False False False False False \n",
+ "50 NaN False False False False False \n",
+ "51 NaN False False False False False \n",
+ "52 NaN False False False False False \n",
+ "... ... ... ... ... ... ... \n",
+ "93029 font False False False False False \n",
+ "93030 font False False False False False \n",
+ "93031 font False False False False False \n",
+ "93032 font False False False False False \n",
+ "93033 font False False False False False \n",
"\n",
- " letter uppercase lowercase titlecase uppercase_map lowercase_map \\\n",
- "160 False False False False 0 0 \n",
- "168 False False False False 0 0 \n",
- "170 True False False False 0 0 \n",
- "175 False False False False 0 0 \n",
- "178 False False False False 0 0 \n",
- "... ... ... ... ... ... ... \n",
- "93029 False False False False 0 0 \n",
- "93030 False False False False 0 0 \n",
- "93031 False False False False 0 0 \n",
- "93032 False False False False 0 0 \n",
- "93033 False False False False 0 0 \n",
+ " ... lowercase_map titlecase_map whitespace printable decimal \\\n",
+ "48 ... 0 0 False True True \n",
+ "49 ... 0 0 False True True \n",
+ "50 ... 0 0 False True True \n",
+ "51 ... 0 0 False True True \n",
+ "52 ... 0 0 False True True \n",
+ "... ... ... ... ... ... ... \n",
+ "93029 ... 0 0 False True True \n",
+ "93030 ... 0 0 False True True \n",
+ "93031 ... 0 0 False True True \n",
+ "93032 ... 0 0 False True True \n",
+ "93033 ... 0 0 False True True \n",
"\n",
- " titlecase_map whitespace printable decimal digit number_like \n",
- "160 0 True True False False False \n",
- "168 0 False True False False False \n",
- "170 0 False True False False False \n",
- "175 0 False True False False False \n",
- "178 0 False True False True True \n",
- "... ... ... ... ... ... ... \n",
- "93029 0 False True True True True \n",
- "93030 0 False True True True True \n",
- "93031 0 False True True True True \n",
- "93032 0 False True True True True \n",
- "93033 0 False True True True True \n",
+ " digit number_like decimal_val digit_val number_like_val \n",
+ "48 True True 0 0 0 \n",
+ "49 True True 1 1 1 \n",
+ "50 True True 2 2 2 \n",
+ "51 True True 3 3 3 \n",
+ "52 True True 4 4 4 \n",
+ "... ... ... ... ... ... \n",
+ "93029 True True 5 5 5 \n",
+ "93030 True True 6 6 6 \n",
+ "93031 True True 7 7 7 \n",
+ "93032 True True 8 8 8 \n",
+ "93033 True True 9 9 9 \n",
"\n",
- "[3833 rows x 20 columns]"
+ "[770 rows x 23 columns]"
]
},
"execution_count": 6,
@@ -968,12 +1002,12 @@
],
"source": [
"# TEST #\n",
- "df[~df['decomposition_type'].isna()]"
+ "df[df['decimal'] == True]"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 18,
"id": "d29860e8",
"metadata": {},
"outputs": [
@@ -1152,98 +1186,134 @@
" \n",
" \n",
" | \n",
- " codepoint | \n",
+ " cp_start | \n",
+ " cp_end | \n",
" uppercase_map | \n",
" lowercase_map | \n",
+ " decimal_val | \n",
+ " repeat | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 65 | \n",
+ " 49 | \n",
+ " 49 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
" 0 | \n",
- " 32 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 66 | \n",
+ " 1 | \n",
+ " 1 | \n",
" 0 | \n",
- " 32 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 67 | \n",
+ " 8 | \n",
+ " 33 | \n",
" 0 | \n",
" 32 | \n",
+ " -9 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 68 | \n",
- " 0 | \n",
" 32 | \n",
+ " 32 | \n",
+ " -32 | \n",
+ " -32 | \n",
+ " 0 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 69 | \n",
+ " 84 | \n",
+ " 59 | \n",
+ " 775 | \n",
+ " 0 | \n",
+ " 0 | \n",
" 0 | \n",
- " 32 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
"
\n",
" \n",
- " | 2984 | \n",
- " 125247 | \n",
+ " 1509 | \n",
+ " 34 | \n",
+ " 34 | \n",
" -34 | \n",
+ " -34 | \n",
+ " 0 | \n",
" 0 | \n",
"
\n",
" \n",
- " | 2985 | \n",
- " 125248 | \n",
- " -34 | \n",
+ " 1510 | \n",
+ " 47 | \n",
+ " 14 | \n",
+ " 34 | \n",
+ " 0 | \n",
+ " 1 | \n",
" 0 | \n",
"
\n",
" \n",
- " | 2986 | \n",
- " 125249 | \n",
- " -34 | \n",
+ " 1511 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 1512 | \n",
+ " 4760 | \n",
+ " 4760 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " -8 | \n",
" 0 | \n",
"
\n",
" \n",
- " | 2987 | \n",
- " 125250 | \n",
- " -34 | \n",
+ " 1513 | \n",
+ " 1 | \n",
+ " 1 | \n",
" 0 | \n",
- "
\n",
- " \n",
- " | 2988 | \n",
- " 125251 | \n",
- " -34 | \n",
" 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
"
\n",
" \n",
"\n",
- "2989 rows × 3 columns
\n",
+ "1514 rows × 6 columns
\n",
""
],
"text/plain": [
- " codepoint uppercase_map lowercase_map\n",
- "0 65 0 32\n",
- "1 66 0 32\n",
- "2 67 0 32\n",
- "3 68 0 32\n",
- "4 69 0 32\n",
- "... ... ... ...\n",
- "2984 125247 -34 0\n",
- "2985 125248 -34 0\n",
- "2986 125249 -34 0\n",
- "2987 125250 -34 0\n",
- "2988 125251 -34 0\n",
+ " cp_start cp_end uppercase_map lowercase_map decimal_val repeat\n",
+ "0 49 49 0 0 1 0\n",
+ "1 1 1 0 0 1 7\n",
+ "2 8 33 0 32 -9 0\n",
+ "3 32 32 -32 -32 0 0\n",
+ "4 84 59 775 0 0 0\n",
+ "... ... ... ... ... ... ...\n",
+ "1509 34 34 -34 -34 0 0\n",
+ "1510 47 14 34 0 1 0\n",
+ "1511 1 1 0 0 1 7\n",
+ "1512 4760 4760 0 0 -8 0\n",
+ "1513 1 1 0 0 1 7\n",
"\n",
- "[2989 rows x 3 columns]"
+ "[1514 rows x 6 columns]"
]
},
"metadata": {},
@@ -1384,13 +1454,19 @@
" 'isPrintable': 'printable',\n",
" 'isDecimal': 'decimal',\n",
" 'isDigit': 'digit',\n",
- " 'isNumberLike': 'number_like'\n",
+ " 'isNumberLike': 'number_like',\n",
+ " 'getDecimal': 'decimal_val',\n",
+ " 'getDigit': 'digit_val',\n",
+ " 'getNumberLike': 'number_like_val'\n",
"}\n",
"\n",
"# Categorize these keys\n",
"str_props = ['name','decomposition']\n",
"bool_props = [ y for x, y in mapping.items() if x.startswith('is') ]\n",
- "int_props = [ 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map' ]\n",
+ "int_props = [\n",
+ " 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map',\n",
+ " 'decimal_val', 'digit_val', 'number_like_val'\n",
+ "]\n",
"\n",
"keep_cols = ['codepoint'] # Always keep codepoint\n",
"keep_cols += [ col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key) ]\n",
@@ -1425,28 +1501,42 @@
"df_int = df_int .sort_values('codepoint').reset_index(drop=True)\n",
"df_str = df_str .sort_values('codepoint').reset_index(drop=True)\n",
"\n",
+ "# [DELTA ENCODING] Collapse runs of consecutive codepoints with identical values into [cp_start, cp_end] ranges\n",
+ "def deltaEncode(df:pd.DataFrame, cols:list[str]):\n",
+ " changed = (df[cols] != df[cols].shift()).any(axis=1)\n",
+ " # Identify where the codepoint sequence breaks (gap > 1)\n",
+ " sequence_broken = (df['codepoint'].diff() > 1)\n",
+ " # A new range starts if metadata changed OR the sequence broke\n",
+ " range_id = (changed | sequence_broken).cumsum()\n",
+ " # Group by the range_id and aggregate each range into a single row\n",
+ " df = df.groupby(range_id).agg(\n",
+ " cp_start=('codepoint', 'min'),\n",
+ " cp_end=('codepoint', 'max'),\n",
+ " **{col: (col, 'first') for col in cols}\n",
+ " )\n",
+ " df = df.reset_index(drop=True)\n",
+ " return df\n",
+ "\n",
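+ "# [RUN LENGTH ENCODING] Row-to-row deltas (first row kept as-is), then identical consecutive deltas collapsed with a 'repeat' count\n",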
+ "def runLengthEncoding(df):\n",
+ " deltas = df.diff().fillna(df).astype(int)\n",
+ " change_mask = (deltas != deltas.shift()).any(axis=1)\n",
+ " group_ids = change_mask.cumsum()\n",
+ " encoded_df = deltas.groupby(group_ids).first()\n",
+ " encoded_df['repeat'] = deltas.groupby(group_ids).size() - 1\n",
+ " return encoded_df.reset_index(drop=True)\n",
+ "\n",
"# [BOOLEAN OPTIMIZATION]\n",
- "# Identify where boolean changes (comparing current row to previous)\n",
- "# We exclude the 'codepoint' from this comparison\n",
- "metadata_cols = [c for c in df_bool.columns if c != 'codepoint']\n",
- "metadata_changed = (df_bool[metadata_cols] != df_bool[metadata_cols].shift()).any(axis=1)\n",
- "# Identify where the codepoint sequence breaks (gap > 1)\n",
- "sequence_broken = (df_bool['codepoint'].diff() > 1)\n",
- "# A new range starts if metadata changed OR the sequence broke\n",
- "range_id = (metadata_changed | sequence_broken).cumsum()\n",
- "# 4. Group by the range_id and aggregate\n",
- "df_bool = df_bool.groupby(range_id).agg(\n",
- " cp_start=('codepoint', 'min'),\n",
- " cp_end=('codepoint', 'max'),\n",
- " **{col: (col, 'first') for col in metadata_cols}\n",
- ")\n",
- "df_bool = df_bool.reset_index(drop=True)\n",
+ "df_bool = deltaEncode(df_bool, [c for c in df_bool.columns if c != 'codepoint'])\n",
"\n",
"# [INT OPTIMIZATION]\n",
"for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n",
- " if k not in df_int.columns: continue\n",
+ " if k not in df_int.columns: continue # vvv make mapping relative vvv\n",
" df_int[k] = df_int.apply(lambda x : x[k] - x['codepoint'] if x[k] != 0 else 0, axis=1)\n",
"\n",
+ "df_int = deltaEncode(df_int, [c for c in df_int.columns if c != 'codepoint'])\n",
+ "df_int = runLengthEncoding(df_int)\n",
+ "\n",
"display(df_bool)\n",
"display(df_int)\n",
"display(df_str)"
@@ -1454,14 +1544,14 @@
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "73cf0f32",
+ "execution_count": 25,
+ "id": "3959426e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "False"
+ "'Size of boolean dataframe: 8667'"
]
},
"metadata": {},
@@ -1470,16 +1560,7 @@
{
"data": {
"text/plain": [
- "False"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "True"
+ "'Size of int dataframe: 36336'"
]
},
"metadata": {},
@@ -1487,24 +1568,13 @@
}
],
"source": [
- "# TEST #\n",
- "display(df_str.empty)\n",
- "display(df_bool.empty)\n",
- "display(df_int.empty)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a0085c0",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Format of binaries\n",
- "# \"X-ICU\"\n",
- "# \n",
- "# \n",
- "# "
+ "# TEST BINARY SIZE #\n",
+ "codepoint_size = 4\n",
+ "flag_size = math.ceil(1./8. * (len(df_bool.columns) - 2))\n",
+ "\n",
+ "display(f\"Size of boolean dataframe: {(2 * codepoint_size + flag_size) * len(df_bool)}\")\n",
+ "display(f\"Size of int dataframe: {(len(df_int.columns) * codepoint_size) * len(df_int)}\")\n",
+ "# TODO: size of string dataframe"
]
},
{
@@ -1516,17 +1586,7 @@
"source": [
"# EXPORT UNICODE #\n",
"OUT_DIR = './out'\n",
- "str_cols = [ 'name', 'decomposition' ]\n",
- "cp_size = 4\n",
- "same_size = CONFIG['general']['zeroPadStrings'] \\\n",
- " or CONFIG['general']['exportStringsInDedicatedFile'] \\\n",
- " or set(str_cols).isdisjoint(df.columns)\n",
- "\n",
- "\n",
- "\n",
- "# Select properties which are, and their size\n",
- "for cfg_key, col_name in mapping.items():\n",
- " if cfg_key in df.columns and cfg_key.startswith('is'):"
+ "\n"
]
},
{
@@ -1534,18 +1594,7 @@
"execution_count": null,
"id": "e75c5b2f",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": []
},
{