diff --git a/py-gen.ipynb b/py-gen.ipynb index e55029a..f34c779 100644 --- a/py-gen.ipynb +++ b/py-gen.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "id": "2baaf74f", "metadata": {}, "outputs": [], @@ -20,12 +20,13 @@ "import os\n", "import numpy as np\n", "import pandas as pd\n", - "import urllib.request\n" + "import urllib.request\n", + "import math" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "id": "41361ca4", "metadata": {}, "outputs": [], @@ -76,6 +77,11 @@ " # like superscripts or circled numbers.\n", " # Usually used by search engines.\n", " 'isNumberLike':False, # Neither of the previous two, but number like in nature, like 3/4\n", + "\n", + " # These getters will get a numeric value out of the char\n", + " 'getDecimal': True,\n", + " 'getDigit': False,\n", + " 'getNumberLike': False,\n", " }\n", "}\n" ] @@ -189,6 +195,10 @@ " out_row['digit'] = is_digit\n", " out_row['number_like'] = is_numeric\n", "\n", + " out_row['decimal_val'] = int(in_row['decimal_val']) if is_decimal else 0\n", + " out_row['digit_val'] = int(in_row['digit_val']) if is_digit else 0\n", + " out_row['number_like_val'] = in_row['numeric_val'] if is_numeric else None\n", + "\n", " # return the data\n", " return out_row\n", "\n", @@ -253,9 +263,7 @@ " combining\n", " letter\n", " uppercase\n", - " lowercase\n", - " titlecase\n", - " uppercase_map\n", + " ...\n", " lowercase_map\n", " titlecase_map\n", " whitespace\n", @@ -263,6 +271,9 @@ " decimal\n", " digit\n", " number_like\n", + " decimal_val\n", + " digit_val\n", + " number_like_val\n", " \n", " \n", " \n", @@ -278,16 +289,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 1\n", @@ -301,16 +313,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 2\n", @@ -324,16 +337,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 3\n", @@ -347,16 +361,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 4\n", @@ -370,16 +385,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " ...\n", @@ -403,6 +419,7 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 299377\n", @@ -416,16 +433,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 299378\n", @@ -439,16 +457,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 299379\n", @@ -462,16 +481,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 299380\n", @@ -485,16 +505,17 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", " 299381\n", @@ -508,20 +529,21 @@ " False\n", " False\n", " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " False\n", + " False\n", " False\n", " False\n", " 0\n", " 0\n", - " 0\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " NaN\n", " \n", " \n", "\n", - "

299382 rows × 20 columns

\n", + "

299382 rows × 23 columns

\n", "" ], "text/plain": [ @@ -551,33 +573,33 @@ "299380 NaN False False False False False \n", "299381 NaN False False False False False \n", "\n", - " lowercase titlecase uppercase_map lowercase_map titlecase_map \\\n", - "0 False False 0 0 0 \n", - "1 False False 0 0 0 \n", - "2 False False 0 0 0 \n", - "3 False False 0 0 0 \n", - "4 False False 0 0 0 \n", - "... ... ... ... ... ... \n", - "299377 False False 0 0 0 \n", - "299378 False False 0 0 0 \n", - "299379 False False 0 0 0 \n", - "299380 False False 0 0 0 \n", - "299381 False False 0 0 0 \n", + " ... lowercase_map titlecase_map whitespace printable decimal \\\n", + "0 ... 0 0 False False False \n", + "1 ... 0 0 False False False \n", + "2 ... 0 0 False False False \n", + "3 ... 0 0 False False False \n", + "4 ... 0 0 False False False \n", + "... ... ... ... ... ... ... \n", + "299377 ... 0 0 False False False \n", + "299378 ... 0 0 False False False \n", + "299379 ... 0 0 False False False \n", + "299380 ... 0 0 False False False \n", + "299381 ... 0 0 False False False \n", "\n", - " whitespace printable decimal digit number_like \n", - "0 False False False False False \n", - "1 False False False False False \n", - "2 False False False False False \n", - "3 False False False False False \n", - "4 False False False False False \n", - "... ... ... ... ... ... \n", - "299377 False False False False False \n", - "299378 False False False False False \n", - "299379 False False False False False \n", - "299380 False False False False False \n", - "299381 False False False False False \n", + " digit number_like decimal_val digit_val number_like_val \n", + "0 False False 0 0 NaN \n", + "1 False False 0 0 NaN \n", + "2 False False 0 0 NaN \n", + "3 False False 0 0 NaN \n", + "4 False False 0 0 NaN \n", + "... ... ... ... ... ... \n", + "299377 False False 0 0 NaN \n", + "299378 False False 0 0 NaN \n", + "299379 False False 0 0 NaN \n", + "299380 False False 0 0 NaN \n", + "299381 False False 0 0 NaN \n", "\n", - "[299382 rows x 20 columns]" + "[299382 rows x 23 columns]" ] }, "execution_count": 5, @@ -634,9 +656,7 @@ " combining\n", " letter\n", " uppercase\n", - " lowercase\n", - " titlecase\n", - " uppercase_map\n", + " ...\n", " lowercase_map\n", " titlecase_map\n", " whitespace\n", @@ -644,123 +664,131 @@ " decimal\n", " digit\n", " number_like\n", + " decimal_val\n", + " digit_val\n", + " number_like_val\n", " \n", " \n", " \n", " \n", - " 160\n", - " 160\n", - " NO-BREAK SPACE\n", - " NON-BREAKING SPACE\n", - " <noBreak> 0020\n", - " noBreak\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 0\n", - " 0\n", - " 0\n", - " True\n", - " True\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - " 168\n", - " 168\n", - " DIAERESIS\n", - " SPACING DIAERESIS\n", - " <compat> 0020 0308\n", - " compat\n", - " False\n", - " True\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 0\n", - " 0\n", - " 0\n", - " False\n", - " True\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - " 170\n", - " 170\n", - " FEMININE ORDINAL INDICATOR\n", + " 48\n", + " 48\n", + " DIGIT ZERO\n", " \n", - " <super> 0061\n", - " super\n", + " \n", + " NaN\n", " False\n", " False\n", " False\n", - " True\n", " False\n", " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", " True\n", - " False\n", - " False\n", - " False\n", + " True\n", + " True\n", + " True\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", - " 175\n", - " 175\n", - " MACRON\n", - " SPACING MACRON\n", - " <compat> 0020 0304\n", - " compat\n", - " False\n", - " True\n", + " 49\n", + " 49\n", + " DIGIT ONE\n", + " \n", + " \n", + " NaN\n", " False\n", " False\n", " False\n", " False\n", " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", " True\n", - " False\n", - " False\n", - " False\n", + " True\n", + " True\n", + " True\n", + " 1\n", + " 1\n", + " 1\n", " \n", " \n", - " 178\n", - " 178\n", - " SUPERSCRIPT TWO\n", - " SUPERSCRIPT DIGIT TWO\n", - " <super> 0032\n", - " super\n", + " 50\n", + " 50\n", + " DIGIT TWO\n", + " \n", + " \n", + " NaN\n", " False\n", " False\n", " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", " True\n", + " True\n", + " True\n", + " True\n", + " 2\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 51\n", + " 51\n", + " DIGIT THREE\n", + " \n", + " \n", + " NaN\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " ...\n", + " 0\n", + " 0\n", " False\n", " True\n", " True\n", + " True\n", + " True\n", + " 3\n", + " 3\n", + " 3\n", + " \n", + " \n", + " 52\n", + " 52\n", + " DIGIT FOUR\n", + " \n", + " \n", + " NaN\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " ...\n", + " 0\n", + " 0\n", + " False\n", + " True\n", + " True\n", + " True\n", + " True\n", + " 4\n", + " 4\n", + " 4\n", " \n", " \n", " ...\n", @@ -784,6 +812,7 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 93029\n", @@ -797,9 +826,7 @@ " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", @@ -807,6 +834,9 @@ " True\n", " True\n", " True\n", + " 5\n", + " 5\n", + " 5\n", " \n", " \n", " 93030\n", @@ -820,9 +850,7 @@ " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", @@ -830,6 +858,9 @@ " True\n", " True\n", " True\n", + " 6\n", + " 6\n", + " 6\n", " \n", " \n", " 93031\n", @@ -843,9 +874,7 @@ " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", @@ -853,6 +882,9 @@ " True\n", " True\n", " True\n", + " 7\n", + " 7\n", + " 7\n", " \n", " \n", " 93032\n", @@ -866,9 +898,7 @@ " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", @@ -876,6 +906,9 @@ " True\n", " True\n", " True\n", + " 8\n", + " 8\n", + " 8\n", " \n", " \n", " 93033\n", @@ -889,9 +922,7 @@ " False\n", " False\n", " False\n", - " False\n", - " False\n", - " 0\n", + " ...\n", " 0\n", " 0\n", " False\n", @@ -899,66 +930,69 @@ " True\n", " True\n", " True\n", + " 9\n", + " 9\n", + " 9\n", " \n", " \n", "\n", - "

3833 rows × 20 columns

\n", + "

770 rows × 23 columns

\n", "" ], "text/plain": [ - " codepoint name old_name \\\n", - "160 160 NO-BREAK SPACE NON-BREAKING SPACE \n", - "168 168 DIAERESIS SPACING DIAERESIS \n", - "170 170 FEMININE ORDINAL INDICATOR \n", - "175 175 MACRON SPACING MACRON \n", - "178 178 SUPERSCRIPT TWO SUPERSCRIPT DIGIT TWO \n", - "... ... ... ... \n", - "93029 130037 SEGMENTED DIGIT FIVE \n", - "93030 130038 SEGMENTED DIGIT SIX \n", - "93031 130039 SEGMENTED DIGIT SEVEN \n", - "93032 130040 SEGMENTED DIGIT EIGHT \n", - "93033 130041 SEGMENTED DIGIT NINE \n", + " codepoint name old_name decomposition \\\n", + "48 48 DIGIT ZERO \n", + "49 49 DIGIT ONE \n", + "50 50 DIGIT TWO \n", + "51 51 DIGIT THREE \n", + "52 52 DIGIT FOUR \n", + "... ... ... ... ... \n", + "93029 130037 SEGMENTED DIGIT FIVE 0035 \n", + "93030 130038 SEGMENTED DIGIT SIX 0036 \n", + "93031 130039 SEGMENTED DIGIT SEVEN 0037 \n", + "93032 130040 SEGMENTED DIGIT EIGHT 0038 \n", + "93033 130041 SEGMENTED DIGIT NINE 0039 \n", "\n", - " decomposition decomposition_type punctuation symbol combining \\\n", - "160 0020 noBreak False False False \n", - "168 0020 0308 compat False True False \n", - "170 0061 super False False False \n", - "175 0020 0304 compat False True False \n", - "178 0032 super False False False \n", - "... ... ... ... ... ... \n", - "93029 0035 font False False False \n", - "93030 0036 font False False False \n", - "93031 0037 font False False False \n", - "93032 0038 font False False False \n", - "93033 0039 font False False False \n", + " decomposition_type punctuation symbol combining letter uppercase \\\n", + "48 NaN False False False False False \n", + "49 NaN False False False False False \n", + "50 NaN False False False False False \n", + "51 NaN False False False False False \n", + "52 NaN False False False False False \n", + "... ... ... ... ... ... ... \n", + "93029 font False False False False False \n", + "93030 font False False False False False \n", + "93031 font False False False False False \n", + "93032 font False False False False False \n", + "93033 font False False False False False \n", "\n", - " letter uppercase lowercase titlecase uppercase_map lowercase_map \\\n", - "160 False False False False 0 0 \n", - "168 False False False False 0 0 \n", - "170 True False False False 0 0 \n", - "175 False False False False 0 0 \n", - "178 False False False False 0 0 \n", - "... ... ... ... ... ... ... \n", - "93029 False False False False 0 0 \n", - "93030 False False False False 0 0 \n", - "93031 False False False False 0 0 \n", - "93032 False False False False 0 0 \n", - "93033 False False False False 0 0 \n", + " ... lowercase_map titlecase_map whitespace printable decimal \\\n", + "48 ... 0 0 False True True \n", + "49 ... 0 0 False True True \n", + "50 ... 0 0 False True True \n", + "51 ... 0 0 False True True \n", + "52 ... 0 0 False True True \n", + "... ... ... ... ... ... ... \n", + "93029 ... 0 0 False True True \n", + "93030 ... 0 0 False True True \n", + "93031 ... 0 0 False True True \n", + "93032 ... 0 0 False True True \n", + "93033 ... 0 0 False True True \n", "\n", - " titlecase_map whitespace printable decimal digit number_like \n", - "160 0 True True False False False \n", - "168 0 False True False False False \n", - "170 0 False True False False False \n", - "175 0 False True False False False \n", - "178 0 False True False True True \n", - "... ... ... ... ... ... ... \n", - "93029 0 False True True True True \n", - "93030 0 False True True True True \n", - "93031 0 False True True True True \n", - "93032 0 False True True True True \n", - "93033 0 False True True True True \n", + " digit number_like decimal_val digit_val number_like_val \n", + "48 True True 0 0 0 \n", + "49 True True 1 1 1 \n", + "50 True True 2 2 2 \n", + "51 True True 3 3 3 \n", + "52 True True 4 4 4 \n", + "... ... ... ... ... ... \n", + "93029 True True 5 5 5 \n", + "93030 True True 6 6 6 \n", + "93031 True True 7 7 7 \n", + "93032 True True 8 8 8 \n", + "93033 True True 9 9 9 \n", "\n", - "[3833 rows x 20 columns]" + "[770 rows x 23 columns]" ] }, "execution_count": 6, @@ -968,12 +1002,12 @@ ], "source": [ "# TEST #\n", - "df[~df['decomposition_type'].isna()]" + "df[df['decimal'] == True]" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 18, "id": "d29860e8", "metadata": {}, "outputs": [ @@ -1152,98 +1186,134 @@ " \n", " \n", " \n", - " codepoint\n", + " cp_start\n", + " cp_end\n", " uppercase_map\n", " lowercase_map\n", + " decimal_val\n", + " repeat\n", " \n", " \n", " \n", " \n", " 0\n", - " 65\n", + " 49\n", + " 49\n", + " 0\n", + " 0\n", + " 1\n", " 0\n", - " 32\n", " \n", " \n", " 1\n", - " 66\n", + " 1\n", + " 1\n", " 0\n", - " 32\n", + " 0\n", + " 1\n", + " 7\n", " \n", " \n", " 2\n", - " 67\n", + " 8\n", + " 33\n", " 0\n", " 32\n", + " -9\n", + " 0\n", " \n", " \n", " 3\n", - " 68\n", - " 0\n", " 32\n", + " 32\n", + " -32\n", + " -32\n", + " 0\n", + " 0\n", " \n", " \n", " 4\n", - " 69\n", + " 84\n", + " 59\n", + " 775\n", + " 0\n", + " 0\n", " 0\n", - " 32\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 2984\n", - " 125247\n", + " 1509\n", + " 34\n", + " 34\n", " -34\n", + " -34\n", + " 0\n", " 0\n", " \n", " \n", - " 2985\n", - " 125248\n", - " -34\n", + " 1510\n", + " 47\n", + " 14\n", + " 34\n", + " 0\n", + " 1\n", " 0\n", " \n", " \n", - " 2986\n", - " 125249\n", - " -34\n", + " 1511\n", + " 1\n", + " 1\n", + " 0\n", + " 0\n", + " 1\n", + " 7\n", + " \n", + " \n", + " 1512\n", + " 4760\n", + " 4760\n", + " 0\n", + " 0\n", + " -8\n", " 0\n", " \n", " \n", - " 2987\n", - " 125250\n", - " -34\n", + " 1513\n", + " 1\n", + " 1\n", " 0\n", - " \n", - " \n", - " 2988\n", - " 125251\n", - " -34\n", " 0\n", + " 1\n", + " 7\n", " \n", " \n", "\n", - "

2989 rows × 3 columns

\n", + "

1514 rows × 6 columns

\n", "" ], "text/plain": [ - " codepoint uppercase_map lowercase_map\n", - "0 65 0 32\n", - "1 66 0 32\n", - "2 67 0 32\n", - "3 68 0 32\n", - "4 69 0 32\n", - "... ... ... ...\n", - "2984 125247 -34 0\n", - "2985 125248 -34 0\n", - "2986 125249 -34 0\n", - "2987 125250 -34 0\n", - "2988 125251 -34 0\n", + " cp_start cp_end uppercase_map lowercase_map decimal_val repeat\n", + "0 49 49 0 0 1 0\n", + "1 1 1 0 0 1 7\n", + "2 8 33 0 32 -9 0\n", + "3 32 32 -32 -32 0 0\n", + "4 84 59 775 0 0 0\n", + "... ... ... ... ... ... ...\n", + "1509 34 34 -34 -34 0 0\n", + "1510 47 14 34 0 1 0\n", + "1511 1 1 0 0 1 7\n", + "1512 4760 4760 0 0 -8 0\n", + "1513 1 1 0 0 1 7\n", "\n", - "[2989 rows x 3 columns]" + "[1514 rows x 6 columns]" ] }, "metadata": {}, @@ -1384,13 +1454,19 @@ " 'isPrintable': 'printable',\n", " 'isDecimal': 'decimal',\n", " 'isDigit': 'digit',\n", - " 'isNumberLike': 'number_like'\n", + " 'isNumberLike': 'number_like',\n", + " 'getDecimal': 'decimal_val',\n", + " 'getDigit': 'digit_val',\n", + " 'getNumberLike': 'number_like_val'\n", "}\n", "\n", "# Categorize these keys\n", "str_props = ['name','decomposition']\n", "bool_props = [ y for x, y in mapping.items() if x.startswith('is') ]\n", - "int_props = [ 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map' ]\n", + "int_props = [\n", + " 'decomposition_type', 'uppercase_map', 'lowercase_map', 'titlecase_map',\n", + " 'decimal_val', 'digit_val', 'number_like_val'\n", + "]\n", "\n", "keep_cols = ['codepoint'] # Always keep codepoint\n", "keep_cols += [ col_name for cfg_key, col_name in mapping.items() if u_cfg.get(cfg_key) ]\n", @@ -1425,28 +1501,42 @@ "df_int = df_int .sort_values('codepoint').reset_index(drop=True)\n", "df_str = df_str .sort_values('codepoint').reset_index(drop=True)\n", "\n", + "# [DELTA ENCODING]\n", + "def deltaEncode(df:pd.DataFrame, cols:list[str]):\n", + " changed = (df[cols] != df[cols].shift()).any(axis=1)\n", + " # Identify where the codepoint sequence breaks (gap > 1)\n", + " sequence_broken = (df['codepoint'].diff() > 1)\n", + " # A new range starts if metadata changed OR the sequence broke\n", + " range_id = (changed | sequence_broken).cumsum()\n", + " # 4. Group by the range_id and aggregate\n", + " df = df.groupby(range_id).agg(\n", + " cp_start=('codepoint', 'min'),\n", + " cp_end=('codepoint', 'max'),\n", + " **{col: (col, 'first') for col in cols}\n", + " )\n", + " df = df.reset_index(drop=True)\n", + " return df\n", + "\n", + "# [RUN LENGTH ENCODING]\n", + "def runLengthEncoding(df):\n", + " deltas = df.diff().fillna(df).astype(int)\n", + " change_mask = (deltas != deltas.shift()).any(axis=1)\n", + " group_ids = change_mask.cumsum()\n", + " encoded_df = deltas.groupby(group_ids).first()\n", + " encoded_df['repeat'] = deltas.groupby(group_ids).size() - 1\n", + " return encoded_df.reset_index(drop=True)\n", + "\n", "# [BOOLEAN OPTIMIZATION]\n", - "# Identify where boolean changes (comparing current row to previous)\n", - "# We exclude the 'codepoint' from this comparison\n", - "metadata_cols = [c for c in df_bool.columns if c != 'codepoint']\n", - "metadata_changed = (df_bool[metadata_cols] != df_bool[metadata_cols].shift()).any(axis=1)\n", - "# Identify where the codepoint sequence breaks (gap > 1)\n", - "sequence_broken = (df_bool['codepoint'].diff() > 1)\n", - "# A new range starts if metadata changed OR the sequence broke\n", - "range_id = (metadata_changed | sequence_broken).cumsum()\n", - "# 4. Group by the range_id and aggregate\n", - "df_bool = df_bool.groupby(range_id).agg(\n", - " cp_start=('codepoint', 'min'),\n", - " cp_end=('codepoint', 'max'),\n", - " **{col: (col, 'first') for col in metadata_cols}\n", - ")\n", - "df_bool = df_bool.reset_index(drop=True)\n", + "df_bool = deltaEncode(df_bool, [c for c in df_bool.columns if c != 'codepoint'])\n", "\n", "# [INT OPTIMIZATION]\n", "for k in ['uppercase_map', 'lowercase_map', 'titlecase_map']:\n", - " if k not in df_int.columns: continue\n", + " if k not in df_int.columns: continue # vvv make mapping relative vvv\n", " df_int[k] = df_int.apply(lambda x : x[k] - x['codepoint'] if x[k] != 0 else 0, axis=1)\n", "\n", + "df_int = deltaEncode(df_int, [c for c in df_int.columns if c != 'codepoint'])\n", + "df_int = runLengthEncoding(df_int)\n", + "\n", "display(df_bool)\n", "display(df_int)\n", "display(df_str)" @@ -1454,14 +1544,14 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "73cf0f32", + "execution_count": 25, + "id": "3959426e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "False" + "'Size of boolean dataframe: 8667'" ] }, "metadata": {}, @@ -1470,16 +1560,7 @@ { "data": { "text/plain": [ - "False" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "True" + "'Size of int dataframe: 36336'" ] }, "metadata": {}, @@ -1487,24 +1568,13 @@ } ], "source": [ - "# TEST #\n", - "display(df_str.empty)\n", - "display(df_bool.empty)\n", - "display(df_int.empty)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a0085c0", - "metadata": {}, - "outputs": [], - "source": [ - "# Format of binaries\n", - "# \"X-ICU\"\n", - "# \n", - "# \n", - "# " + "# TEST BINARY SIZE #\n", + "codepoint_size = 4\n", + "flag_size = math.ceil(1./8. * (len(df_bool.columns) - 2))\n", + "\n", + "display(f\"Size of boolean dataframe: {(2 * codepoint_size + flag_size) * len(df_bool)}\")\n", + "display(f\"Size of int dataframe: {(len(df_int.columns) * codepoint_size) * len(df_int)}\")\n", + "# TODO: size of string dataframe" ] }, { @@ -1516,17 +1586,7 @@ "source": [ "# EXPORT UNICODE #\n", "OUT_DIR = './out'\n", - "str_cols = [ 'name', 'decomposition' ]\n", - "cp_size = 4\n", - "same_size = CONFIG['general']['zeroPadStrings'] \\\n", - " or CONFIG['general']['exportStringsInDedicatedFile'] \\\n", - " or set(str_cols).isdisjoint(df.columns)\n", - "\n", - "\n", - "\n", - "# Select properties which are, and their size\n", - "for cfg_key, col_name in mapping.items():\n", - " if cfg_key in df.columns and cfg_key.startswith('is'):" + "\n" ] }, { @@ -1534,18 +1594,7 @@ "execution_count": null, "id": "e75c5b2f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [] }, {