beginning of compiler

This commit is contained in:
2026-06-20 11:53:08 -06:00
parent 5ddecb0c38
commit 9176c4882f
20 changed files with 1784 additions and 0 deletions
+302
View File
@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 95,
"id": "00e26c5b",
"metadata": {},
"outputs": [],
"source": [
"from lark import Lark, Transformer\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "cc16be1a",
"metadata": {},
"outputs": [],
"source": [
"ebnf_targets = {\n",
" \"assembly\": {\n",
" \"src\": \"./samples/assembly.ebnf\",\n",
" \"dst\": \"./spider/compiler/assembly/AssemblyParser.hpp\",\n",
" \"cnt\": None,\n",
" },\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "e88d212f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--- Loading EBNF Targets ---\n",
"✅ Success [assembly]: Loaded './samples/assembly.ebnf' -> Target destination: './spider/compiler/assembly/AssemblyParser.hpp'\n"
]
}
],
"source": [
"print(\"\\n--- Loading EBNF Targets ---\")\n",
"for target_name, paths in ebnf_targets.items():\n",
" src_path = paths[\"src\"]\n",
" dst_path = paths[\"dst\"]\n",
" \n",
" try:\n",
" with open(src_path, \"r\", encoding=\"utf-8\") as file:\n",
" paths[\"cnt\"] = file.read()\n",
" print(f\"✅ Success [{target_name}]: Loaded '{src_path}' -> Target destination: '{dst_path}'\")\n",
" \n",
" except FileNotFoundError:\n",
" print(f\"❌ Error [{target_name}]: Source file not found at '{src_path}'\")\n"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "e8095002",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from lark import Lark, Transformer\n",
"\n",
"iso_ebnf_meta_grammar = r\"\"\"\n",
" start: rule+\n",
" rule: RULE_NAME \"=\" expression \";\"\n",
" \n",
" ?expression: alternation\n",
" alternation: sequence (\"|\" sequence)*\n",
" \n",
" sequence: item ( [\",\"] item )*\n",
" \n",
" ?item: atom\n",
" | atom \"?\" -> optional\n",
" | atom \"*\" -> repeat\n",
" | \"[\" expression \"]\" -> optional\n",
" | \"{\" expression \"}\" -> repeat\n",
" \n",
" ?atom: RULE_NAME -> call_rule\n",
" | TERMINAL -> match_terminal\n",
" | SPECIAL_SEQ -> handle_special\n",
" | \"(\" expression \")\" -> group\n",
"\n",
" RULE_NAME: /[a-zA-Z_][a-zA-Z0-9_]*/\n",
" TERMINAL: /\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"/ | /'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'/\n",
" SPECIAL_SEQ: /\\?[\\s\\S]*?\\?/\n",
" COMMENT: /\\(\\*([\\s\\S]*?)\\*\\)/\n",
"\n",
" %import common.WS\n",
" %ignore WS\n",
" %ignore COMMENT\n",
"\"\"\"\n",
"\n",
"class AssemblyCppGenerator(Transformer):\n",
" def start(self, rules):\n",
" cpp_functions = \"\\n\\n\".join(rules)\n",
" return f\"\"\"#pragma once\n",
"\n",
"#include <iostream>\n",
"#include <string>\n",
"#include <vector>\n",
"#include <stdexcept>\n",
"\n",
"class AssemblyParser {{\n",
"private:\n",
" std::string src;\n",
" size_t pos = 0;\n",
"\n",
" std::string peek_str(size_t len) {{\n",
" if (pos + len <= src.length()) return src.substr(pos, len);\n",
" return src.substr(pos);\n",
" }}\n",
"\n",
" char peek() {{ return pos < src.length() ? src[pos] : '\\\\0'; }}\n",
" \n",
" void match_char(char expected) {{\n",
" if (peek() == expected) pos++;\n",
" else throw std::runtime_error(\"Unexpected token matching character\");\n",
" }}\n",
"\n",
" void match_string(std::string expected) {{\n",
" if (peek_str(expected.length()) == expected) pos += expected.length();\n",
" else throw std::runtime_error(\"Unexpected token matching string: \" + expected);\n",
" }}\n",
"\n",
" bool isUTF8Alpha() {{ return isalpha(peek()); }}\n",
" bool isWhithespaceCharNotCrLf() {{ return peek() == ' ' || peek() == '\\\\t'; }}\n",
" bool isUTF8CharNotCrLf() {{ return peek() != '\\\\r' && peek() != '\\\\n' && peek() != '\\\\0'; }}\n",
" bool isUTF8CharLitCont() {{ return peek() != '\\'' && peek() != '\\\\\\\\'; }}\n",
" bool isUTF8StringLitCont() {{ return peek() != '\"' && peek() != '\\\\\\\\'; }}\n",
"\n",
"public:\n",
" AssemblyParser(std::string input) : src(input) {{}}\n",
"\n",
" void parse() {{\n",
" parse_program(); \n",
" if (pos < src.length()) throw std::runtime_error(\"Trailing characters left unparsed.\");\n",
" std::cout << \"Assembly source compiled cleanly!\" << std::endl;\n",
" }}\n",
"\n",
"{cpp_functions}\n",
"}};\n",
"\"\"\"\n",
"\n",
" def rule(self, args):\n",
" name, expr = args\n",
" return f\" void parse_{name}() {{\\n{expr}\\n }}\"\n",
"\n",
" # FIX 1: Explicitly handle choice logic using C++ style paths\n",
" def alternation(self, items):\n",
" code_lines = []\n",
" for i, item in enumerate(items):\n",
" # Clean up padding whitespace if any\n",
" clean_item = str(item).strip()\n",
" if not clean_item: continue\n",
" \n",
" # Since lookahead processing requires FIRST sets, we scaffold a sequential fallback\n",
" if i == 0:\n",
" code_lines.append(f\" if (/* option {i+1} */ true) {{\\n {clean_item}\\n }}\")\n",
" else:\n",
" code_lines.append(f\" else if (/* option {i+1} */ true) {{\\n {clean_item}\\n }}\")\n",
" return \"\\n\".join(code_lines)\n",
"\n",
" def sequence(self, items):\n",
" flattened_items = []\n",
" for item in items:\n",
" if isinstance(item, list):\n",
" for sub_item in item:\n",
" if sub_item: flattened_items.append(str(sub_item).strip())\n",
" elif item:\n",
" flattened_items.append(str(item).strip())\n",
" return \"\\n\".join(f\" {item}\" for item in flattened_items if item)\n",
"\n",
" def call_rule(self, token):\n",
" rule_name = token[0].value if isinstance(token, list) else token.value\n",
" return f\"parse_{rule_name}();\"\n",
"\n",
" # FIX 2: Generate match_string instead of match_char for multi-char string keywords like \"include\"\n",
" def match_terminal(self, token):\n",
" raw_token_str = token[0].value if isinstance(token, list) else token.value\n",
" raw_val = raw_token_str[1:-1]\n",
" \n",
" if raw_val == r\"\\r\": return \"match_char('\\\\r');\"\n",
" if raw_val == r\"\\n\": return \"match_char('\\\\n');\"\n",
" if raw_val == r\"\\t\": return \"match_char('\\\\t');\"\n",
" if raw_val == r\"\\\\\": return \"match_char('\\\\\\\\');\"\n",
" if not raw_val: return \"// Empty string match\"\n",
" \n",
" if len(raw_val) > 1:\n",
" return f\"match_string(\\\"{raw_val}\\\");\"\n",
" return f\"match_char('{raw_val}');\"\n",
"\n",
" def handle_special(self, token):\n",
" raw_string = token[0].value if isinstance(token, list) else token.value\n",
" func_name = raw_string.strip('?').strip()\n",
" return f\"if ({func_name}()) {{ pos++; }} else {{ throw std::runtime_error(\\\"Failed validation for {func_name}\\\"); }}\"\n",
"\n",
" def optional(self, args):\n",
" content = args[0] if not isinstance(args[0], list) else \"\\n \".join(args[0])\n",
" return f\"// Optional block\\n if (/* lookahead check */ true) {{\\n {content}\\n }}\"\n",
"\n",
" def repeat(self, args):\n",
" content = args[0] if not isinstance(args[0], list) else \"\\n \".join(args[0])\n",
" return f\"// Repeat block\\n while (/* lookahead check */ true) {{\\n {content}\\n }}\"\n",
"\n",
" def group(self, args):\n",
" # Flatten grouped elements cleanly to strings\n",
" if isinstance(args, list):\n",
" return \"\\n\".join(str(x) for x in args)\n",
" return str(args)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "558915ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- Starting C++ Compilation Loop ---\n",
"Parsing and converting target rule sets for: assembly\n",
"🎉 Code generation complete! Output stored in './spider/compiler/assembly/AssemblyParser.hpp'\n"
]
}
],
"source": [
"print(\"--- Starting C++ Compilation Loop ---\")\n",
"\n",
"try:\n",
" meta_parser = Lark(iso_ebnf_meta_grammar, parser='lalr')\n",
" \n",
" for name, target in ebnf_targets.items():\n",
" print(f\"Parsing and converting target rule sets for: {name}\")\n",
" \n",
" # Build the compiler AST tree from your exact text\n",
" syntax_tree = meta_parser.parse(target[\"cnt\"])\n",
" \n",
" # Transform the AST structural nodes into pure C++ Source strings\n",
" compiler_transformer = AssemblyCppGenerator()\n",
" compiled_cpp_header = compiler_transformer.transform(syntax_tree)\n",
" \n",
" # Output directly to your destination path\n",
" os.makedirs(os.path.dirname(target[\"dst\"]), exist_ok=True)\n",
" with open(target[\"dst\"], \"w\", encoding=\"utf-8\") as f:\n",
" f.write(compiled_cpp_header)\n",
" \n",
" print(f\"🎉 Code generation complete! Output stored in '{target['dst']}'\")\n",
"\n",
"except Exception as e:\n",
" print(f\"❌ Failed to process custom architecture. Error details: \\n{e}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366688c3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd1aca3f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}