spider-compiler/pygen.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "00e26c5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lark import Lark, Transformer\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "cc16be1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Registry of EBNF grammars to compile, keyed by target name.\n",
    "#   src: path of the EBNF source file\n",
    "#   dst: path the generated C++ header is written to\n",
    "#   cnt: grammar text; starts as None until a source file is read into it\n",
    "ebnf_targets = {\n",
    "    \"assembly\": dict(\n",
    "        src=\"./samples/assembly.ebnf\",\n",
    "        dst=\"./spider/compiler/assembly/AssemblyParser.hpp\",\n",
    "        cnt=None,\n",
    "    ),\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "e88d212f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Loading EBNF Targets ---\n",
      "✅ Success [assembly]: Loaded './samples/assembly.ebnf' -> Target destination: './spider/compiler/assembly/AssemblyParser.hpp'\n"
     ]
    }
   ],
   "source": [
    "# Read every registered grammar into its \"cnt\" slot; a failed load leaves \"cnt\" as None.\n",
    "print(\"\\n--- Loading EBNF Targets ---\")\n",
    "for target_name, paths in ebnf_targets.items():\n",
    "    src_path = paths[\"src\"]\n",
    "    dst_path = paths[\"dst\"]\n",
    "\n",
    "    # Reset first so a re-run never keeps stale text from a source that can no longer be read.\n",
    "    paths[\"cnt\"] = None\n",
    "    try:\n",
    "        with open(src_path, \"r\", encoding=\"utf-8\") as file:\n",
    "            paths[\"cnt\"] = file.read()\n",
    "        print(f\"✅ Success [{target_name}]: Loaded '{src_path}' -> Target destination: '{dst_path}'\")\n",
    "    except FileNotFoundError:\n",
    "        print(f\"❌ Error [{target_name}]: Source file not found at '{src_path}'\")\n",
    "    except (OSError, UnicodeDecodeError) as exc:\n",
    "        # Unreadable or non-UTF-8 sources are reported like a missing file instead of aborting the cell.\n",
    "        print(f\"❌ Error [{target_name}]: Could not read '{src_path}': {exc}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "e8095002",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from lark import Lark, Transformer\n",
    "\n",
    "# Lark grammar describing the EBNF dialect accepted as input:\n",
    "#   - a file is one or more rules of the form  name = expression ;\n",
    "#   - alternatives are separated by \"|\"; items in a sequence may be separated by an optional \",\"\n",
    "#   - \"[...]\" or a trailing \"?\" marks an optional item, \"{...}\" or a trailing \"*\" a repeated one\n",
    "#   - atoms are rule names, quoted terminals, \"? ... ?\" special sequences and \"( ... )\" groups\n",
    "#   - whitespace and \"(* ... *)\" comments are ignored\n",
    "# The \"->\" aliases name the Transformer callbacks invoked for each alternative.\n",
    "iso_ebnf_meta_grammar = r\"\"\"\n",
    "    start: rule+\n",
    "    rule: RULE_NAME \"=\" expression \";\"\n",
    "    \n",
    "    ?expression: alternation\n",
    "    alternation: sequence (\"|\" sequence)*\n",
    "    \n",
    "    sequence: item ( [\",\"] item )*\n",
    "    \n",
    "    ?item: atom\n",
    "         | atom \"?\" -> optional\n",
    "         | atom \"*\" -> repeat\n",
    "         | \"[\" expression \"]\" -> optional\n",
    "         | \"{\" expression \"}\" -> repeat\n",
    "         \n",
    "    ?atom: RULE_NAME -> call_rule\n",
    "         | TERMINAL -> match_terminal\n",
    "         | SPECIAL_SEQ -> handle_special\n",
    "         | \"(\" expression \")\" -> group\n",
    "\n",
    "    RULE_NAME: /[a-zA-Z_][a-zA-Z0-9_]*/\n",
    "    TERMINAL: /\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"/ | /'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'/\n",
    "    SPECIAL_SEQ: /\\?[\\s\\S]*?\\?/\n",
    "    COMMENT: /\\(\\*([\\s\\S]*?)\\*\\)/\n",
    "\n",
    "    %import common.WS\n",
    "    %ignore WS\n",
    "    %ignore COMMENT\n",
    "\"\"\"\n",
    "\n",
    "class AssemblyCppGenerator(Transformer):\n",
    "    \"\"\"Turn a parsed EBNF grammar tree into the source of a C++ recursive-descent parser.\n",
    "\n",
    "    Every callback returns a C++ fragment as a string with no base indentation;\n",
    "    rule() indents the finished body and start() splices all rules into the\n",
    "    AssemblyParser class template. Choices, optionals and repetitions are\n",
    "    emitted as backtracking blocks: the generated matchers throw\n",
    "    std::runtime_error on a mismatch, so each block saves pos, tries its body\n",
    "    and restores pos when the body throws.\n",
    "    \"\"\"\n",
    "\n",
    "    # Handler parameter used by every generated backtracking block.\n",
    "    _CPP_FAIL = \"const std::runtime_error&\"\n",
    "\n",
    "    @staticmethod\n",
    "    def _token_text(token):\n",
    "        \"\"\"Return the text of a token given either the token or a one-element child list.\"\"\"\n",
    "        return token[0].value if isinstance(token, list) else token.value\n",
    "\n",
    "    @staticmethod\n",
    "    def _indent(code, levels=1):\n",
    "        \"\"\"Indent every non-blank line of code by the given number of four-space steps.\"\"\"\n",
    "        pad = \"    \" * levels\n",
    "        return \"\\n\".join(pad + line if line.strip() else line for line in str(code).splitlines())\n",
    "\n",
    "    @staticmethod\n",
    "    def _block_text(args):\n",
    "        \"\"\"Return the body of an optional/repeat node as a single string.\"\"\"\n",
    "        body = args[0]\n",
    "        if isinstance(body, list):\n",
    "            return \"\\n\".join(str(part) for part in body)\n",
    "        return str(body)\n",
    "\n",
    "    def start(self, rules):\n",
    "        \"\"\"Splice the generated rule functions into the AssemblyParser header template.\n",
    "\n",
    "        The emitted parse() entry point calls parse_program(), so the grammar is\n",
    "        expected to define a rule named program.\n",
    "        \"\"\"\n",
    "        cpp_functions = \"\\n\\n\".join(rules)\n",
    "        return f\"\"\"#pragma once\n",
    "\n",
    "#include <cctype>\n",
    "#include <iostream>\n",
    "#include <string>\n",
    "#include <vector>\n",
    "#include <stdexcept>\n",
    "\n",
    "class AssemblyParser {{\n",
    "private:\n",
    "    std::string src;\n",
    "    size_t pos = 0;\n",
    "\n",
    "    std::string peek_str(size_t len) {{\n",
    "        if (pos + len <= src.length()) return src.substr(pos, len);\n",
    "        return src.substr(pos);\n",
    "    }}\n",
    "\n",
    "    char peek() {{ return pos < src.length() ? src[pos] : '\\\\0'; }}\n",
    "    \n",
    "    void match_char(char expected) {{\n",
    "        if (peek() == expected) pos++;\n",
    "        else throw std::runtime_error(\"Unexpected token matching character\");\n",
    "    }}\n",
    "\n",
    "    void match_string(std::string expected) {{\n",
    "        if (peek_str(expected.length()) == expected) pos += expected.length();\n",
    "        else throw std::runtime_error(\"Unexpected token matching string: \" + expected);\n",
    "    }}\n",
    "\n",
    "    bool isUTF8Alpha() {{ return isalpha(static_cast<unsigned char>(peek())); }}\n",
    "    bool isWhithespaceCharNotCrLf() {{ return peek() == ' ' || peek() == '\\\\t'; }}\n",
    "    bool isUTF8CharNotCrLf() {{ return peek() != '\\\\r' && peek() != '\\\\n' && peek() != '\\\\0'; }}\n",
    "    bool isUTF8CharLitCont() {{ return peek() != '\\\\'' && peek() != '\\\\\\\\'; }}\n",
    "    bool isUTF8StringLitCont() {{ return peek() != '\"' && peek() != '\\\\\\\\'; }}\n",
    "\n",
    "public:\n",
    "    AssemblyParser(std::string input) : src(input) {{}}\n",
    "\n",
    "    void parse() {{\n",
    "        parse_program(); \n",
    "        if (pos < src.length()) throw std::runtime_error(\"Trailing characters left unparsed.\");\n",
    "        std::cout << \"Assembly source compiled cleanly!\" << std::endl;\n",
    "    }}\n",
    "\n",
    "{cpp_functions}\n",
    "}};\n",
    "\"\"\"\n",
    "\n",
    "    def rule(self, args):\n",
    "        \"\"\"Emit one void parse_<name>() member function whose body is the rule's expression code.\"\"\"\n",
    "        name, expr = args\n",
    "        return f\"    void parse_{name}() {{\\n{self._indent(expr, 2)}\\n    }}\"\n",
    "\n",
    "    def alternation(self, items):\n",
    "        \"\"\"Emit an ordered choice: try each option in turn, rewinding pos after a failed one.\n",
    "\n",
    "        A single option is returned unchanged. The last option is not wrapped in\n",
    "        a try block, so its failure propagates to the caller.\n",
    "        \"\"\"\n",
    "        options = [text for text in (str(item).strip() for item in items) if text]\n",
    "        if len(options) <= 1:\n",
    "            return options[0] if options else \"\"\n",
    "\n",
    "        # Build the nested try/catch chain from the last option outwards.\n",
    "        code = options[-1]\n",
    "        for option in reversed(options[:-1]):\n",
    "            code = (\n",
    "                \"try {\\n\"\n",
    "                f\"{self._indent(option)}\\n\"\n",
    "                f\"}} catch ({self._CPP_FAIL}) {{\\n\"\n",
    "                \"    pos = alt_start;\\n\"\n",
    "                f\"{self._indent(code)}\\n\"\n",
    "                \"}\"\n",
    "            )\n",
    "        # The enclosing braces scope alt_start so nested choices cannot clash.\n",
    "        return \"{\\n    const size_t alt_start = pos;\\n\" + self._indent(code) + \"\\n}\"\n",
    "\n",
    "    def sequence(self, items):\n",
    "        \"\"\"Join the code of consecutive items, one statement block per line, skipping empty ones.\"\"\"\n",
    "        statements = []\n",
    "        for item in items:\n",
    "            parts = item if isinstance(item, list) else [item]\n",
    "            for part in parts:\n",
    "                text = str(part).strip() if part else \"\"\n",
    "                if text:\n",
    "                    statements.append(text)\n",
    "        return \"\\n\".join(statements)\n",
    "\n",
    "    def call_rule(self, token):\n",
    "        \"\"\"Emit a call to the parse function generated for the referenced rule.\"\"\"\n",
    "        return f\"parse_{self._token_text(token)}();\"\n",
    "\n",
    "    def match_terminal(self, token):\n",
    "        \"\"\"Emit match_char for a one-character terminal and match_string for longer ones.\n",
    "\n",
    "        Backslash escapes written in the grammar are passed through to C++ and\n",
    "        count as one character; quote characters that would end the C++ literal\n",
    "        are escaped.\n",
    "        \"\"\"\n",
    "        raw_val = self._token_text(token)[1:-1]\n",
    "        if not raw_val:\n",
    "            return \"// Empty string match\"\n",
    "\n",
    "        # Split into logical characters: a backslash and the character after it form one unit.\n",
    "        units = []\n",
    "        idx = 0\n",
    "        while idx < len(raw_val):\n",
    "            step = 2 if raw_val[idx] == \"\\\\\" and idx + 1 < len(raw_val) else 1\n",
    "            units.append(raw_val[idx:idx + step])\n",
    "            idx += step\n",
    "\n",
    "        if len(units) == 1:\n",
    "            char_escapes = {\"'\": \"\\\\'\", \"\\\\\": \"\\\\\\\\\"}\n",
    "            unit = char_escapes.get(units[0], units[0])\n",
    "            return f\"match_char('{unit}');\"\n",
    "\n",
    "        string_escapes = {'\"': '\\\\\"', \"\\\\\": \"\\\\\\\\\"}\n",
    "        literal = \"\".join(string_escapes.get(unit, unit) for unit in units)\n",
    "        return f'match_string(\"{literal}\");'\n",
    "\n",
    "    def handle_special(self, token):\n",
    "        \"\"\"Emit a one-character check that calls the predicate named inside the ? ... ? sequence.\"\"\"\n",
    "        func_name = self._token_text(token).strip('?').strip()\n",
    "        return f\"if ({func_name}()) {{ pos++; }} else {{ throw std::runtime_error(\\\"Failed validation for {func_name}\\\"); }}\"\n",
    "\n",
    "    def optional(self, args):\n",
    "        \"\"\"Emit a block that tries the content once and rewinds pos if it does not match.\"\"\"\n",
    "        content = self._block_text(args)\n",
    "        return (\n",
    "            \"// Optional block\\n\"\n",
    "            \"{\\n\"\n",
    "            \"    const size_t opt_start = pos;\\n\"\n",
    "            \"    try {\\n\"\n",
    "            f\"{self._indent(content, 2)}\\n\"\n",
    "            f\"    }} catch ({self._CPP_FAIL}) {{\\n\"\n",
    "            \"        pos = opt_start;\\n\"\n",
    "            \"    }\\n\"\n",
    "            \"}\"\n",
    "        )\n",
    "\n",
    "    def repeat(self, args):\n",
    "        \"\"\"Emit a loop that matches the content until an iteration fails or consumes nothing.\"\"\"\n",
    "        content = self._block_text(args)\n",
    "        return (\n",
    "            \"// Repeat block\\n\"\n",
    "            \"while (true) {\\n\"\n",
    "            \"    const size_t rep_start = pos;\\n\"\n",
    "            \"    try {\\n\"\n",
    "            f\"{self._indent(content, 2)}\\n\"\n",
    "            f\"    }} catch ({self._CPP_FAIL}) {{\\n\"\n",
    "            \"        pos = rep_start;\\n\"\n",
    "            \"        break;\\n\"\n",
    "            \"    }\\n\"\n",
    "            # A zero-width iteration would otherwise loop forever.\n",
    "            \"    if (pos == rep_start) break;\\n\"\n",
    "            \"}\"\n",
    "        )\n",
    "\n",
    "    def group(self, args):\n",
    "        \"\"\"Return the code of a parenthesised expression unchanged, joined into one string.\"\"\"\n",
    "        if isinstance(args, list):\n",
    "            return \"\\n\".join(str(x) for x in args)\n",
    "        return str(args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "558915ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Starting C++ Compilation Loop ---\n",
      "Parsing and converting target rule sets for: assembly\n",
      "🎉 Code generation complete! Output stored in './spider/compiler/assembly/AssemblyParser.hpp'\n"
     ]
    }
   ],
   "source": [
    "# Parse each loaded grammar with the meta grammar and write the generated C++ header.\n",
    "print(\"--- Starting C++ Compilation Loop ---\")\n",
    "\n",
    "try:\n",
    "    meta_parser = Lark(iso_ebnf_meta_grammar, parser='lalr')\n",
    "    \n",
    "    for name, target in ebnf_targets.items():\n",
    "        print(f\"Parsing and converting target rule sets for: {name}\")\n",
    "        \n",
    "        # A target whose source was never read still has cnt=None; report that\n",
    "        # directly instead of letting the parser fail on a non-string input.\n",
    "        if target[\"cnt\"] is None:\n",
    "            raise ValueError(f\"No grammar text loaded for target '{name}' (source: '{target['src']}')\")\n",
    "        \n",
    "        # Build the syntax tree from the grammar text\n",
    "        syntax_tree = meta_parser.parse(target[\"cnt\"])\n",
    "        \n",
    "        # Transform the tree nodes into C++ source strings\n",
    "        compiler_transformer = AssemblyCppGenerator()\n",
    "        compiled_cpp_header = compiler_transformer.transform(syntax_tree)\n",
    "        \n",
    "        # Create the destination directory only when the path has one:\n",
    "        # os.makedirs(\"\") raises for a bare file name.\n",
    "        dst_dir = os.path.dirname(target[\"dst\"])\n",
    "        if dst_dir:\n",
    "            os.makedirs(dst_dir, exist_ok=True)\n",
    "        with open(target[\"dst\"], \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(compiled_cpp_header)\n",
    "            \n",
    "        print(f\"🎉 Code generation complete! Output stored in '{target['dst']}'\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Failed to process custom architecture. Error details: \\n{e}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "366688c3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd1aca3f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}