# Add libraries...

# Configure what to get
config = {
    'general': {
        # If true, export the strings to their own dedicated file.
        'exportStringsInDedicatedFile': True,
        # If true, each string block is zero-padded to a uniform size.
        # This makes jumping between blocks very straightforward and fast.
        'zeroPadStrings': True,
    },
    'sources': {
        'unicode': 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
    },
    'unicode': {
        # Unicode 1.0 name, considered "old".
        # Set to None to use the old name only if the current name is not
        # descriptive, e.g. use "START OF TEXT" instead of "<control>".
        'useOldName': False,
        'getName': False,  # Formal name of the codepoint
        # PROBABLY REMOVE # 'getCategory':False,
        # TODO # 'getScript': True,  # e.g., "Cyrillic"
        # TODO # 'getBlock': True,   # e.g., "Mathematical Alphanumeric Symbols"

        'getDecomposition': False,      # aka "ascii-fy"
        'getDecompositionType': False,  # metadata about decomposition

        'toLowercase': False,
        'toUppercase': False,
        'toTitlecase': False,

        'isEmoji': True,        # Does it have an emoji presentation?
        'isPunctuation': True,  # General Category starts with 'P'
        'isSymbol': True,       # General Category starts with 'S' (Math, Currency)
        'isCombining': True,    # Is it a mark/accent that needs a base letter?

        'isPrintable': True,
        'isSpace': True,
        'isWhitespace': True,
        'isLetter': True,
        'isUppercase': True,
        'isLowercase': True,
        'isTitlecase': True,
        'isDeprecated': True,

        # NOTE: The following fields work in a cascade,
        # so if one field is true, the next one is also true.
        'isDecimal': True,     # An actual number 0-9, in different languages
        'isDigit': True,       # Digits not used in standard positional notation,
                               # like superscripts or circled numbers.
                               # Usually used by search engines.
        'isNumberLike': True,  # Neither of the previous two, but number-like
                               # in nature, like 3/4
    }
}


# Column names of one UnicodeData.txt record, in file order.
_UNICODE_DATA_FIELDS = (
    "codepoint", "name", "category", "combining_class", "bidi_class",
    "decomposition", "decimal_val", "digit_val", "numeric_val",
    "bidi_mirrored", "unicode_1_name", "iso_comment",
    "uppercase_map", "lowercase_map", "titlecase_map",
)

# Code points with the Unicode White_Space property whose General Category
# is a control (Cc) rather than a separator: TAB, LF, VT, FF, CR and NEL.
_CONTROL_WHITESPACE = frozenset({0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85})

# Separator categories: space, line and paragraph separators.
_SEPARATOR_CATEGORIES = frozenset({'Zs', 'Zl', 'Zp'})

# Separators that have no visible rendering of their own.
_NON_PRINTABLE_SEPARATORS = frozenset({'Zl', 'Zp'})


def unicodeDataParseLine(text_line: str, config: dict) -> dict:
    """Parse one semicolon-separated UnicodeData.txt record into a dict.

    Example record:
        10D84;GARAY SMALL LETTER OLD KA;Ll;0;R;;;;;N;;;10D64;;10D64

    Args:
        text_line: A single record; surrounding whitespace is stripped.
        config: Currently not read by this function; accepted so the
            signature stays stable for callers.

    Returns:
        A dict with the keys ``codepoint`` (int), ``name``, ``old_name``,
        ``decomposition`` (str), ``decomposition_type`` (str or None) and the
        boolean flags ``punctuation``, ``symbol``, ``combining``, ``letter``,
        ``uppercase``, ``lowercase``, ``titlecase``, ``whitespace``,
        ``printable``, ``decimal``, ``digit`` and ``number_like``.

    Raises:
        ValueError: If the record has fewer than 15 fields or the code point
            is not valid hexadecimal.
    """
    parts = text_line.strip().split(';')
    if len(parts) < len(_UNICODE_DATA_FIELDS):
        raise ValueError(
            f"expected {len(_UNICODE_DATA_FIELDS)} ';'-separated fields, "
            f"got {len(parts)}: {text_line!r}"
        )
    in_row = dict(zip(_UNICODE_DATA_FIELDS, parts))

    # Now, parse the fields individually
    out_row = {}

    # Parse codepoint
    codepoint = int(in_row['codepoint'], 16)
    out_row['codepoint'] = codepoint

    out_row['name'] = in_row['name']
    out_row['old_name'] = in_row['unicode_1_name']

    decomp = in_row['decomposition']
    out_row['decomposition'] = decomp

    # A compatibility decomposition starts with a tag such as <compat> or
    # <super>; the tag name (without the brackets) is the decomposition type.
    decomposition_type = None
    if decomp.startswith('<'):
        tag_end = decomp.find('>')
        if tag_end != -1:
            decomposition_type = decomp[1:tag_end]
    out_row['decomposition_type'] = decomposition_type

    category = in_row['category']
    out_row['punctuation'] = category.startswith('P')
    out_row['symbol'] = category.startswith('S')
    out_row['combining'] = category.startswith('M')  # Mark category
    out_row['letter'] = category.startswith('L')

    out_row['uppercase'] = category == 'Lu'
    out_row['lowercase'] = category == 'Ll'
    out_row['titlecase'] = category == 'Lt'

    # All separators (Zs, Zl, Zp) are whitespace, as are the control
    # characters that carry the White_Space property.
    out_row['whitespace'] = (
        category in _SEPARATOR_CATEGORIES or codepoint in _CONTROL_WHITESPACE
    )

    # Non-printable: every "Other" (C*) category plus the line and paragraph
    # separators. Space separators (Zs) stay printable.
    out_row['printable'] = not (
        category.startswith('C') or category in _NON_PRINTABLE_SEPARATORS
    )

    # Decimal (0-9) -> Digit (Superscripts) -> NumberLike (Fractions/Roman)
    # Each level implies the next, so the flags are OR-ed down the chain.
    is_decimal = bool(in_row['decimal_val'])
    is_digit = is_decimal or bool(in_row['digit_val'])
    is_numeric = is_digit or bool(in_row['numeric_val'])

    out_row['decimal'] = is_decimal
    out_row['digit'] = is_digit
    out_row['number_like'] = is_numeric

    # return the data
    return out_row


# Format of binaries
# [HEAD]
# TODO: document the sections that follow the header.