beginning of compiler

2026-06-20 11:53:08 -06:00
parent 5ddecb0c38
commit 9176c4882f
20 changed files with 1784 additions and 0 deletions
@@ -0,0 +1,6 @@
 # Ignore local build output for now.
 # These entries will be revisited once
 # the project switches to a custom
 # build system.
 /bin
 /out
@@ -0,0 +1,302 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "00e26c5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lark import Lark, Transformer\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "cc16be1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ebnf_targets = {\n",
    "    \"assembly\": {\n",
    "        \"src\": \"./samples/assembly.ebnf\",\n",
    "        \"dst\": \"./spider/compiler/assembly/AssemblyParser.hpp\",\n",
    "        \"cnt\": None,\n",
    "    },\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "e88d212f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Loading EBNF Targets ---\n",
      "✅ Success [assembly]: Loaded './samples/assembly.ebnf' -> Target destination: './spider/compiler/assembly/AssemblyParser.hpp'\n"
     ]
    }
   ],
   "source": [
    "print(\"\\n--- Loading EBNF Targets ---\")\n",
    "for target_name, paths in ebnf_targets.items():\n",
    "    src_path = paths[\"src\"]\n",
    "    dst_path = paths[\"dst\"]\n",
    "    \n",
    "    try:\n",
    "        with open(src_path, \"r\", encoding=\"utf-8\") as file:\n",
    "            paths[\"cnt\"] = file.read()\n",
    "            print(f\"✅ Success [{target_name}]: Loaded '{src_path}' -> Target destination: '{dst_path}'\")\n",
    "        \n",
    "    except FileNotFoundError:\n",
    "        print(f\"❌ Error [{target_name}]: Source file not found at '{src_path}'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "e8095002",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from lark import Lark, Transformer\n",
    "\n",
    "iso_ebnf_meta_grammar = r\"\"\"\n",
    "    start: rule+\n",
    "    rule: RULE_NAME \"=\" expression \";\"\n",
    "    \n",
    "    ?expression: alternation\n",
    "    alternation: sequence (\"|\" sequence)*\n",
    "    \n",
    "    sequence: item ( [\",\"] item )*\n",
    "    \n",
    "    ?item: atom\n",
    "         | atom \"?\" -> optional\n",
    "         | atom \"*\" -> repeat\n",
    "         | \"[\" expression \"]\" -> optional\n",
    "         | \"{\" expression \"}\" -> repeat\n",
    "         \n",
    "    ?atom: RULE_NAME -> call_rule\n",
    "         | TERMINAL -> match_terminal\n",
    "         | SPECIAL_SEQ -> handle_special\n",
    "         | \"(\" expression \")\" -> group\n",
    "\n",
    "    RULE_NAME: /[a-zA-Z_][a-zA-Z0-9_]*/\n",
    "    TERMINAL: /\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"/ | /'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'/\n",
    "    SPECIAL_SEQ: /\\?[\\s\\S]*?\\?/\n",
    "    COMMENT: /\\(\\*([\\s\\S]*?)\\*\\)/\n",
    "\n",
    "    %import common.WS\n",
    "    %ignore WS\n",
    "    %ignore COMMENT\n",
    "\"\"\"\n",
    "\n",
    "class AssemblyCppGenerator(Transformer):\n",
    "    def start(self, rules):\n",
    "        cpp_functions = \"\\n\\n\".join(rules)\n",
    "        return f\"\"\"#pragma once\n",
    "\n",
    "#include <iostream>\n",
    "#include <string>\n",
    "#include <vector>\n",
    "#include <stdexcept>\n",
    "\n",
    "class AssemblyParser {{\n",
    "private:\n",
    "    std::string src;\n",
    "    size_t pos = 0;\n",
    "\n",
    "    std::string peek_str(size_t len) {{\n",
    "        if (pos + len <= src.length()) return src.substr(pos, len);\n",
    "        return src.substr(pos);\n",
    "    }}\n",
    "\n",
    "    char peek() {{ return pos < src.length() ? src[pos] : '\\\\0'; }}\n",
    "    \n",
    "    void match_char(char expected) {{\n",
    "        if (peek() == expected) pos++;\n",
    "        else throw std::runtime_error(\"Unexpected token matching character\");\n",
    "    }}\n",
    "\n",
    "    void match_string(std::string expected) {{\n",
    "        if (peek_str(expected.length()) == expected) pos += expected.length();\n",
    "        else throw std::runtime_error(\"Unexpected token matching string: \" + expected);\n",
    "    }}\n",
    "\n",
    "    bool isUTF8Alpha() {{ return isalpha(peek()); }}\n",
    "    bool isWhithespaceCharNotCrLf() {{ return peek() == ' ' || peek() == '\\\\t'; }}\n",
    "    bool isUTF8CharNotCrLf() {{ return peek() != '\\\\r' && peek() != '\\\\n' && peek() != '\\\\0'; }}\n",
    "    bool isUTF8CharLitCont() {{ return peek() != '\\'' && peek() != '\\\\\\\\'; }}\n",
    "    bool isUTF8StringLitCont() {{ return peek() != '\"' && peek() != '\\\\\\\\'; }}\n",
    "\n",
    "public:\n",
    "    AssemblyParser(std::string input) : src(input) {{}}\n",
    "\n",
    "    void parse() {{\n",
    "        parse_program(); \n",
    "        if (pos < src.length()) throw std::runtime_error(\"Trailing characters left unparsed.\");\n",
    "        std::cout << \"Assembly source compiled cleanly!\" << std::endl;\n",
    "    }}\n",
    "\n",
    "{cpp_functions}\n",
    "}};\n",
    "\"\"\"\n",
    "\n",
    "    def rule(self, args):\n",
    "        name, expr = args\n",
    "        return f\"    void parse_{name}() {{\\n{expr}\\n    }}\"\n",
    "\n",
    "    # FIX 1: Explicitly handle choice logic using C++ style paths\n",
    "    def alternation(self, items):\n",
    "        code_lines = []\n",
    "        for i, item in enumerate(items):\n",
    "            # Clean up padding whitespace if any\n",
    "            clean_item = str(item).strip()\n",
    "            if not clean_item: continue\n",
    "            \n",
    "            # Since lookahead processing requires FIRST sets, we scaffold a sequential fallback\n",
    "            if i == 0:\n",
    "                code_lines.append(f\"        if (/* option {i+1} */ true) {{\\n    {clean_item}\\n        }}\")\n",
    "            else:\n",
    "                code_lines.append(f\"        else if (/* option {i+1} */ true) {{\\n    {clean_item}\\n        }}\")\n",
    "        return \"\\n\".join(code_lines)\n",
    "\n",
    "    def sequence(self, items):\n",
    "        flattened_items = []\n",
    "        for item in items:\n",
    "            if isinstance(item, list):\n",
    "                for sub_item in item:\n",
    "                    if sub_item: flattened_items.append(str(sub_item).strip())\n",
    "            elif item:\n",
    "                flattened_items.append(str(item).strip())\n",
    "        return \"\\n\".join(f\"        {item}\" for item in flattened_items if item)\n",
    "\n",
    "    def call_rule(self, token):\n",
    "        rule_name = token[0].value if isinstance(token, list) else token.value\n",
    "        return f\"parse_{rule_name}();\"\n",
    "\n",
    "    # FIX 2: Generate match_string instead of match_char for multi-char string keywords like \"include\"\n",
    "    def match_terminal(self, token):\n",
    "        raw_token_str = token[0].value if isinstance(token, list) else token.value\n",
    "        raw_val = raw_token_str[1:-1]\n",
    "        \n",
    "        if raw_val == r\"\\r\": return \"match_char('\\\\r');\"\n",
    "        if raw_val == r\"\\n\": return \"match_char('\\\\n');\"\n",
    "        if raw_val == r\"\\t\": return \"match_char('\\\\t');\"\n",
    "        if raw_val == r\"\\\\\": return \"match_char('\\\\\\\\');\"\n",
    "        if not raw_val: return \"// Empty string match\"\n",
    "        \n",
    "        if len(raw_val) > 1:\n",
    "            return f\"match_string(\\\"{raw_val}\\\");\"\n",
    "        return f\"match_char('{raw_val}');\"\n",
    "\n",
    "    def handle_special(self, token):\n",
    "        raw_string = token[0].value if isinstance(token, list) else token.value\n",
    "        func_name = raw_string.strip('?').strip()\n",
    "        return f\"if ({func_name}()) {{ pos++; }} else {{ throw std::runtime_error(\\\"Failed validation for {func_name}\\\"); }}\"\n",
    "\n",
    "    def optional(self, args):\n",
    "        content = args[0] if not isinstance(args[0], list) else \"\\n        \".join(args[0])\n",
    "        return f\"// Optional block\\n        if (/* lookahead check */ true) {{\\n    {content}\\n        }}\"\n",
    "\n",
    "    def repeat(self, args):\n",
    "        content = args[0] if not isinstance(args[0], list) else \"\\n        \".join(args[0])\n",
    "        return f\"// Repeat block\\n        while (/* lookahead check */ true) {{\\n    {content}\\n        }}\"\n",
    "\n",
    "    def group(self, args):\n",
    "        # Flatten grouped elements cleanly to strings\n",
    "        if isinstance(args, list):\n",
    "            return \"\\n\".join(str(x) for x in args)\n",
    "        return str(args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "558915ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Starting C++ Compilation Loop ---\n",
      "Parsing and converting target rule sets for: assembly\n",
      "🎉 Code generation complete! Output stored in './spider/compiler/assembly/AssemblyParser.hpp'\n"
     ]
    }
   ],
   "source": [
    "print(\"--- Starting C++ Compilation Loop ---\")\n",
    "\n",
    "try:\n",
    "    meta_parser = Lark(iso_ebnf_meta_grammar, parser='lalr')\n",
    "    \n",
    "    for name, target in ebnf_targets.items():\n",
    "        print(f\"Parsing and converting target rule sets for: {name}\")\n",
    "        \n",
    "        # Build the compiler AST tree from your exact text\n",
    "        syntax_tree = meta_parser.parse(target[\"cnt\"])\n",
    "        \n",
    "        # Transform the AST structural nodes into pure C++ Source strings\n",
    "        compiler_transformer = AssemblyCppGenerator()\n",
    "        compiled_cpp_header = compiler_transformer.transform(syntax_tree)\n",
    "        \n",
    "        # Output directly to your destination path\n",
    "        os.makedirs(os.path.dirname(target[\"dst\"]), exist_ok=True)\n",
    "        with open(target[\"dst\"], \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(compiled_cpp_header)\n",
    "            \n",
    "        print(f\"🎉 Code generation complete! Output stored in '{target['dst']}'\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Failed to process custom architecture. Error details: \\n{e}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "366688c3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd1aca3f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,76 @@
 (* Spider Assembly EBNF | Sintek Analytics @ 2026 | All Rights Reserved *)
 (* Characters & Structures *)
 letter          = ? isUTF8Alpha ? ;
 digit            = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
 alpha_num_char   = letter | digit ;
 hex_digit        = digit | "A" | "B" | "C" | "D" | "E" | "F" | "a" | "b" | "c" | "d" | "e" | "f" ;
 octal_digit      = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
 binary_digit     = "0" | "1" ;
 ws_char          = ? isWhithespaceCharNotCrLf ? ;
 ws_optional      = { ws_char } ;
 whitespace       = ws_char , { ws_char } ;
 newline          = "\r" | "\n" | "\r\n" ;
 utf8_char        = ? isUTF8CharNotCrLf ? ;
 char_escape      = "\\", utf8_char ;
 char_content     = char_escape | ? isUTF8CharLitCont ? ; (* Not ' or \ *)
 char_lit         = "'", char_content, "'" ;
 string_char      = char_escape | ? isUTF8StringLitCont ? ; (* Not " or \ *)
 string_lit       = '"', { string_char }, '"' ;
 (* Literals *)
 identifier       = ( letter | "_" ) , { alpha_num_char | "_" } ;
 comment          = ";" , { utf8_char } ;
 sign             = "+" | "-" ;
 exponent_marker  = "e" | "E" ;
 exponent         = exponent_marker , [ sign ] , digit , { digit } ;
 decimal_lit      = [ sign ] , digit , { digit } , [ "B" | "S" | "I" | "L" ] ;
 float_lit        = [ sign ] , ( 
                      ( digit , { digit } , "." , digit , { digit } , [ exponent ] ) | 
                      ( "." , digit , { digit } , [ exponent ] ) | 
                      ( digit , { digit } , exponent ) 
                    ) , [ "F" | "D" ] ;
 hex_lit          = [ sign ] , "0x" , hex_digit , { hex_digit } ;
 octal_lit        = [ sign ] , "0c" , octal_digit , { octal_digit } ;
 binary_lit       = [ sign ] , "0b" , binary_digit , { binary_digit } ;
 literal          = decimal_lit | float_lit | hex_lit | octal_lit | binary_lit | string_lit | char_lit ;
 literal_cast     = ("B" | "S" | "I" | "L" | "F" | "D"), ws_optional, "(", ws_optional, literal, ws_optional, ")" ;
 literal_decl     = literal | literal_cast ;
 (* Operands *)
 register         = "R" , alpha_num_char , alpha_num_char ;
 addrm_ind        = "[", ws_optional, literal_decl, ws_optional, "]" ;
 addrm_ptr        = "[", ws_optional, register, ws_optional, "]" ;
 addrm_idx        = "[", ws_optional, register, ws_optional, "+", ws_optional, literal_decl, ws_optional, "]";
 addrm_sca        = "[", ws_optional, register, ws_optional, "+", register, ws_optional, "*", ws_optional, literal_decl, ws_optional, "]";
 addrm_dis        = "[", ws_optional, register, ws_optional, "+", register, ws_optional, "*", ws_optional, literal_decl, ws_optional, "+", ws_optional, literal_decl, ws_optional, "]";
 addr_modes       = addrm_ind | addrm_ptr | addrm_idx | addrm_sca | addrm_dis ;
 operand          = register | identifier | literal_decl | addr_modes ;
 (* Generalized Instructions *)
 opcode           = letter , { alpha_num_char } ;
 operand_list     = operand , { "," , ws_optional , operand } ;
 instruction      = opcode , [ whitespace , operand_list ] ;
 (* Added Preprocessor, Sections, and Metadata Syntaxes *)
 include_decl     = "include", whitespace, string_lit ;
 annotation_oper  = identifier, [ ws_optional, "=", ws_optional, literal_decl ] ;
 annotation_ops   = annotation_oper , { ws_optional, "," , ws_optional , annotation_oper } ;
 annotation_args  = "(", ws_optional, annotation_ops, ws_optional, ")" ;
 annotation       = "@", identifier, [ annotation_args ] ;
 section_decl     = "section", whitespace, ".", identifier ;
 (* Line Structure *)
 label            = identifier, ":" ;
 line_content     = include_decl | section_decl | ( [ annotation, whitespace ], [ label, ws_optional ], [ instruction ] ) ;
 line             = ws_optional, [ line_content ], ws_optional , [ comment ] , newline ;
 line_last        = ws_optional, [ line_content ], ws_optional , [ comment ] ;
 program          = { line }, [ line_last ] ;
@@ -0,0 +1,11 @@
@asm
 .data
 .code
 MOV RA, 1
 MOV RB, 8 ; Input number
 :loop_start
 MUL RA, RB
 NOT RB ; RB != 0? Updates equal flag
 DEC RB ; RB -= 1
 JEQ loop_start ; If equal flag, goto loop_start
 ; End program, result in RA
@@ -0,0 +1,21 @@
 {
 	"folders": [
 		{
 			"path": "."
 		}
 	],
 	"settings": {
 		"gitlens.remotes": [
 			{
 				"domain": "git.sintekanalytics.com",
 				"type": "Gitea",
 				"name": "Sintek Analytics' Git",
 				"protocol": "https"
 			}
 		],
 		"C_Cpp.default.includePath": [
 			"./src"
 		],
 		"terminal.integrated.defaultProfile.windows": "MSYS2 UCRT"
 	}
 }
@@ -0,0 +1,812 @@
 #pragma once
 #include <cctype>
 #include <iostream>
 #include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
 class AssemblyParser {
 private:
    std::string src;
    size_t pos = 0;
    // Returns up to `len` characters starting at the cursor without consuming
    // them; the result is shorter when fewer than `len` characters remain.
    std::string peek_str(size_t len) {
        // substr() already truncates the count to the characters available.
        return src.substr(pos, len);
    }
    // Returns the character at the cursor without consuming it, or '\0' once
    // the cursor has reached the end of the input.
    char peek() {
        if (pos >= src.length()) {
            return '\0';
        }
        return src[pos];
    }
    // Consumes one character when it equals `expected`; otherwise throws
    // std::runtime_error and leaves the cursor where it was.
    void match_char(char expected) {
        if (peek() != expected) {
            throw std::runtime_error("Unexpected token matching character");
        }
        pos++;
    }
    void match_string(std::string expected) {
        if (peek_str(expected.length()) == expected) pos += expected.length();
        else throw std::runtime_error("Unexpected token matching string: " + expected);
    }
    // True when the byte at the cursor is classified as alphabetic.
    // The byte is converted to unsigned char first: handing isalpha() a
    // negative value (any byte >= 0x80 where char is signed) is undefined
    // behaviour.
    bool isUTF8Alpha() {
        return std::isalpha(static_cast<unsigned char>(peek())) != 0;
    }
    // True when the character at the cursor is a space or a horizontal tab.
    bool isWhithespaceCharNotCrLf() {
        const char current = peek();
        return current == ' ' || current == '\t';
    }
    // True for every character except carriage return, line feed and the
    // '\0' value that peek() reports at end of input.
    bool isUTF8CharNotCrLf() {
        switch (peek()) {
            case '\r':
            case '\n':
            case '\0':
                return false;
            default:
                return true;
        }
    }
    bool isUTF8CharLitCont() { return peek() != '\'' && peek() != '\\'; }
    bool isUTF8StringLitCont() { return peek() != '"' && peek() != '\\'; }
 public:
    // Stores the text to parse. The by-value parameter is moved into the
    // member so the source text is not copied a second time.
    AssemblyParser(std::string input) : src(std::move(input)) {}
    // Entry point: runs parse_program() and requires that it consumed the
    // entire input. Throws std::runtime_error when characters are left over;
    // prints a confirmation line to stdout on success.
    void parse() {
        parse_program();
        const bool has_leftover_input = pos < src.length();
        if (has_leftover_input) {
            throw std::runtime_error("Trailing characters left unparsed.");
        }
        std::cout << "Assembly source compiled cleanly!" << std::endl;
    }
    // letter = ? isUTF8Alpha ? ;
    // Consumes one character accepted by isUTF8Alpha(), otherwise throws.
    void parse_letter() {
        if (!isUTF8Alpha()) {
            throw std::runtime_error("Failed validation for isUTF8Alpha");
        }
        pos++;
    }
    void parse_digit() {
        if (/* option 1 */ true) {
            match_char('0');
        } else if (/* option 2 */ true) {
            match_char('1');
        } else if (/* option 3 */ true) {
            match_char('2');
        } else if (/* option 4 */ true) {
            match_char('3');
        } else if (/* option 5 */ true) {
            match_char('4');
        } else if (/* option 6 */ true) {
            match_char('5');
        } else if (/* option 7 */ true) {
            match_char('6');
        } else if (/* option 8 */ true) {
            match_char('7');
        } else if (/* option 9 */ true) {
            match_char('8');
        } else if (/* option 10 */ true) {
            match_char('9');
        }
    }
    void parse_alpha_num_char() {
        if (/* option 1 */ true) {
            parse_letter();
        } else if (/* option 2 */ true) {
            parse_digit();
        }
    }
    void parse_hex_digit() {
        if (/* option 1 */ true) {
            parse_digit();
        } else if (/* option 2 */ true) {
            match_char('A');
        } else if (/* option 3 */ true) {
            match_char('B');
        } else if (/* option 4 */ true) {
            match_char('C');
        } else if (/* option 5 */ true) {
            match_char('D');
        } else if (/* option 6 */ true) {
            match_char('E');
        } else if (/* option 7 */ true) {
            match_char('F');
        } else if (/* option 8 */ true) {
            match_char('a');
        } else if (/* option 9 */ true) {
            match_char('b');
        } else if (/* option 10 */ true) {
            match_char('c');
        } else if (/* option 11 */ true) {
            match_char('d');
        } else if (/* option 12 */ true) {
            match_char('e');
        } else if (/* option 13 */ true) {
            match_char('f');
        }
    }
    void parse_octal_digit() {
        if (/* option 1 */ true) {
            match_char('0');
        } else if (/* option 2 */ true) {
            match_char('1');
        } else if (/* option 3 */ true) {
            match_char('2');
        } else if (/* option 4 */ true) {
            match_char('3');
        } else if (/* option 5 */ true) {
            match_char('4');
        } else if (/* option 6 */ true) {
            match_char('5');
        } else if (/* option 7 */ true) {
            match_char('6');
        } else if (/* option 8 */ true) {
            match_char('7');
        }
    }
    void parse_binary_digit() {
        if (/* option 1 */ true) {
            match_char('0');
        } else if (/* option 2 */ true) {
            match_char('1');
        }
    }
    // ws_char = ? isWhithespaceCharNotCrLf ? ;
    // Consumes one space or tab, otherwise throws.
    void parse_ws_char() {
        if (!isWhithespaceCharNotCrLf()) {
            throw std::runtime_error("Failed validation for isWhithespaceCharNotCrLf");
        }
        pos++;
    }
    void parse_ws_optional() {
        if (/* option 1 */ true) {
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_ws_char();
                }
            }
        }
    }
    void parse_whitespace() {
        if (/* option 1 */ true) {
            parse_ws_char();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_ws_char();
                }
            }
        }
    }
    void parse_newline() {
        if (/* option 1 */ true) {
            match_char('\r');
        } else if (/* option 2 */ true) {
            match_char('\n');
        } else if (/* option 3 */ true) {
            match_string("\r\n");
        }
    }
    // utf8_char = ? isUTF8CharNotCrLf ? ;
    // Consumes one character accepted by isUTF8CharNotCrLf(), otherwise throws.
    void parse_utf8_char() {
        if (!isUTF8CharNotCrLf()) {
            throw std::runtime_error("Failed validation for isUTF8CharNotCrLf");
        }
        pos++;
    }
    // char_escape = "\\", utf8_char ;
    // Consumes a backslash followed by one utf8_char.
    void parse_char_escape() {
        match_char('\\');
        parse_utf8_char();
    }
    void parse_char_content() {
        if (/* option 1 */ true) {
            parse_char_escape();
        } else if (/* option 2 */ true) {
            if (isUTF8CharLitCont()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8CharLitCont"); }
        }
    }
    // char_lit = "'", char_content, "'" ;
    // Consumes a single-quoted character literal.
    void parse_char_lit() {
        const char quote = '\'';
        match_char(quote);
        parse_char_content();
        match_char(quote);
    }
    void parse_string_char() {
        if (/* option 1 */ true) {
            parse_char_escape();
        } else if (/* option 2 */ true) {
            if (isUTF8StringLitCont()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8StringLitCont"); }
        }
    }
    void parse_string_lit() {
        if (/* option 1 */ true) {
            match_char('"');
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_string_char();
                }
            }
            match_char('"');
        }
    }
    void parse_identifier() {
        if (/* option 1 */ true) {
            if (/* option 1 */ true) {
                parse_letter();
            } else if (/* option 2 */ true) {
                match_char('_');
            }
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_alpha_num_char();
                } else if (/* option 2 */ true) {
                    match_char('_');
                }
            }
        }
    }
    void parse_comment() {
        if (/* option 1 */ true) {
            match_char(';');
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_utf8_char();
                }
            }
        }
    }
    void parse_sign() {
        if (/* option 1 */ true) {
            match_char('+');
        } else if (/* option 2 */ true) {
            match_char('-');
        }
    }
    void parse_exponent_marker() {
        if (/* option 1 */ true) {
            match_char('e');
        } else if (/* option 2 */ true) {
            match_char('E');
        }
    }
    void parse_exponent() {
        if (/* option 1 */ true) {
            parse_exponent_marker();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            parse_digit();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_digit();
                }
            }
        }
    }
    void parse_decimal_lit() {
        if (/* option 1 */ true) {
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            parse_digit();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_digit();
                }
            }
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    match_char('B');
                } else if (/* option 2 */ true) {
                    match_char('S');
                } else if (/* option 3 */ true) {
                    match_char('I');
                } else if (/* option 4 */ true) {
                    match_char('L');
                }
            }
        }
    }
    void parse_float_lit() {
        if (/* option 1 */ true) {
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            if (/* option 1 */ true) {
                if (/* option 1 */ true) {
                    parse_digit();
                    // Repeat block
                    while (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_digit();
                        }
                    }
                    match_char('.');
                    parse_digit();
                    // Repeat block
                    while (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_digit();
                        }
                    }
                    // Optional block
                    if (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_exponent();
                        }
                    }
                }
            } else if (/* option 2 */ true) {
                if (/* option 1 */ true) {
                    match_char('.');
                    parse_digit();
                    // Repeat block
                    while (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_digit();
                        }
                    }
                    // Optional block
                    if (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_exponent();
                        }
                    }
                }
            } else if (/* option 3 */ true) {
                if (/* option 1 */ true) {
                    parse_digit();
                    // Repeat block
                    while (/* lookahead check */ true) {
                        if (/* option 1 */ true) {
                            parse_digit();
                        }
                    }
                    parse_exponent();
                }
            }
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    match_char('F');
                } else if (/* option 2 */ true) {
                    match_char('D');
                }
            }
        }
    }
    void parse_hex_lit() {
        if (/* option 1 */ true) {
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            match_string("0x");
            parse_hex_digit();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_hex_digit();
                }
            }
        }
    }
    void parse_octal_lit() {
        if (/* option 1 */ true) {
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            match_string("0c");
            parse_octal_digit();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_octal_digit();
                }
            }
        }
    }
    void parse_binary_lit() {
        if (/* option 1 */ true) {
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_sign();
                }
            }
            match_string("0b");
            parse_binary_digit();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_binary_digit();
                }
            }
        }
    }
    void parse_literal() {
        if (/* option 1 */ true) {
            parse_decimal_lit();
        } else if (/* option 2 */ true) {
            parse_float_lit();
        } else if (/* option 3 */ true) {
            parse_hex_lit();
        } else if (/* option 4 */ true) {
            parse_octal_lit();
        } else if (/* option 5 */ true) {
            parse_binary_lit();
        } else if (/* option 6 */ true) {
            parse_string_lit();
        } else if (/* option 7 */ true) {
            parse_char_lit();
        }
    }
    void parse_literal_cast() {
        if (/* option 1 */ true) {
            if (/* option 1 */ true) {
                match_char('B');
            } else if (/* option 2 */ true) {
                match_char('S');
            } else if (/* option 3 */ true) {
                match_char('I');
            } else if (/* option 4 */ true) {
                match_char('L');
            } else if (/* option 5 */ true) {
                match_char('F');
            } else if (/* option 6 */ true) {
                match_char('D');
            }
            parse_ws_optional();
            match_char('(');
            parse_ws_optional();
            parse_literal();
            parse_ws_optional();
            match_char(')');
        }
    }
    void parse_literal_decl() {
        if (/* option 1 */ true) {
            parse_literal();
        } else if (/* option 2 */ true) {
            parse_literal_cast();
        }
    }
    // register = "R" , alpha_num_char , alpha_num_char ;
    // Consumes the letter 'R' followed by exactly two alpha_num_char.
    void parse_register() {
        match_char('R');
        for (int remaining = 2; remaining > 0; --remaining) {
            parse_alpha_num_char();
        }
    }
    // addrm_ind = "[", ws_optional, literal_decl, ws_optional, "]" ;
    // Bracketed literal, with optional blanks inside the brackets.
    void parse_addrm_ind() {
        match_char('[');
        parse_ws_optional();

        parse_literal_decl();

        parse_ws_optional();
        match_char(']');
    }
    // addrm_ptr = "[", ws_optional, register, ws_optional, "]" ;
    // Bracketed register, with optional blanks inside the brackets.
    void parse_addrm_ptr() {
        match_char('[');
        parse_ws_optional();

        parse_register();

        parse_ws_optional();
        match_char(']');
    }
    // addrm_idx = "[", ws_optional, register, ws_optional, "+", ws_optional,
    //             literal_decl, ws_optional, "]" ;
    // Bracketed "register + literal" form.
    void parse_addrm_idx() {
        match_char('[');
        parse_ws_optional();

        // Base register.
        parse_register();
        parse_ws_optional();

        // "+ literal" offset.
        match_char('+');
        parse_ws_optional();
        parse_literal_decl();

        parse_ws_optional();
        match_char(']');
    }
    // addrm_sca = "[", ws_optional, register, ws_optional, "+", register,
    //             ws_optional, "*", ws_optional, literal_decl, ws_optional, "]" ;
    // Bracketed "register + register * literal" form.
    void parse_addrm_sca() {
        match_char('[');
        parse_ws_optional();

        // Base register.
        parse_register();
        parse_ws_optional();

        // "+register": no blanks are accepted between '+' and the register.
        match_char('+');
        parse_register();
        parse_ws_optional();

        // "* literal" scale.
        match_char('*');
        parse_ws_optional();
        parse_literal_decl();

        parse_ws_optional();
        match_char(']');
    }
    // Addressing mode `dis`:
    // '[' register '+' register '*' literal_decl '+' literal_decl ']'.
    // NOTE(review): no optional whitespace is consumed between the first '+'
    // and the second register — confirm this matches the grammar's intent.
    void parse_addrm_dis() {
        match_char('[');
        parse_ws_optional();

        // First register.
        parse_register();
        parse_ws_optional();

        // '+' register (no whitespace skipped after the '+').
        match_char('+');
        parse_register();
        parse_ws_optional();

        // '*' literal_decl
        match_char('*');
        parse_ws_optional();
        parse_literal_decl();
        parse_ws_optional();

        // '+' literal_decl
        match_char('+');
        parse_ws_optional();
        parse_literal_decl();
        parse_ws_optional();

        match_char(']');
    }
    // Parses one of the bracketed addressing modes (ind, ptr, idx, sca, dis).
    // NOTE(review): all guards are the constant `true` placeholder, so only
    // parse_addrm_ind() is reachable. Every alternative begins with
    // match_char('['), so choosing between them will need more than one
    // character of lookahead (or backtracking).
    void parse_addr_modes() {
        if (/* option 1 */ true) {
            parse_addrm_ind();
        } else if (/* option 2 */ true) {
            parse_addrm_ptr();
        } else if (/* option 3 */ true) {
            parse_addrm_idx();
        } else if (/* option 4 */ true) {
            parse_addrm_sca();
        } else if (/* option 5 */ true) {
            parse_addrm_dis();
        }
    }
    // Parses a single operand: a register, an identifier, a literal
    // declaration, or an addressing mode.
    // NOTE(review): all guards are the constant `true` placeholder, so only
    // the register alternative is reachable as written.
    void parse_operand() {
        if (/* option 1 */ true) {
            parse_register();
        } else if (/* option 2 */ true) {
            parse_identifier();
        } else if (/* option 3 */ true) {
            parse_literal_decl();
        } else if (/* option 4 */ true) {
            parse_addr_modes();
        }
    }
    // Parses an opcode: one letter followed by any number of alphanumeric
    // characters.
    // NOTE(review): the repeat loop's condition is the constant `true` and the
    // body contains no `break`/`return`, so the loop can only end if a callee
    // throws or otherwise leaves the function — a real lookahead check is
    // still required here.
    void parse_opcode() {
        if (/* option 1 */ true) {
            parse_letter();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_alpha_num_char();
                }
            }
        }
    }
    // Parses a comma-separated operand list: one operand, then zero or more
    // repetitions of ',' + optional whitespace + operand.
    // NOTE(review): the repeat loop's condition is the constant `true` and the
    // body contains no `break`/`return`, so the loop can only end if a callee
    // throws or otherwise leaves the function — a real lookahead check is
    // still required here.
    void parse_operand_list() {
        if (/* option 1 */ true) {
            parse_operand();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    match_char(',');
                    parse_ws_optional();
                    parse_operand();
                }
            }
        }
    }
    // Parses an instruction: an opcode, optionally followed by whitespace and
    // an operand list.
    // NOTE(review): the optional block's lookahead check is the constant
    // `true`, so the operand list is currently always parsed.
    void parse_instruction() {
        if (/* option 1 */ true) {
            parse_opcode();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_whitespace();
                    parse_operand_list();
                }
            }
        }
    }
    // Parses an include declaration: the keyword "include", whitespace, then
    // a string literal.
    void parse_include_decl() {
        match_string("include");
        parse_whitespace();

        parse_string_lit();
    }
    // Parses one annotation operand: an identifier, optionally followed by
    // '=' and a literal declaration (optional whitespace around the '=').
    // NOTE(review): the optional block's lookahead check is the constant
    // `true`, so the "= literal" part is currently always parsed.
    void parse_annotation_oper() {
        if (/* option 1 */ true) {
            parse_identifier();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_ws_optional();
                    match_char('=');
                    parse_ws_optional();
                    parse_literal_decl();
                }
            }
        }
    }
    // Parses a comma-separated list of annotation operands: one operand, then
    // zero or more repetitions of ',' + operand (optional whitespace around
    // the comma).
    // NOTE(review): the repeat loop's condition is the constant `true` and the
    // body contains no `break`/`return`, so the loop can only end if a callee
    // throws or otherwise leaves the function — a real lookahead check is
    // still required here.
    void parse_annotation_ops() {
        if (/* option 1 */ true) {
            parse_annotation_oper();
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_ws_optional();
                    match_char(',');
                    parse_ws_optional();
                    parse_annotation_oper();
                }
            }
        }
    }
    // Parses an annotation argument list: '(' annotation_ops ')', with
    // optional whitespace just inside the parentheses.
    void parse_annotation_args() {
        match_char('(');
        parse_ws_optional();

        parse_annotation_ops();

        parse_ws_optional();
        match_char(')');
    }
    // Parses an annotation: '@' followed by an identifier and an optional
    // parenthesised argument list.
    // NOTE(review): the optional block's lookahead check is the constant
    // `true`, so the argument list is currently always parsed.
    void parse_annotation() {
        if (/* option 1 */ true) {
            match_char('@');
            parse_identifier();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_annotation_args();
                }
            }
        }
    }
    // Parses a section declaration: the keyword "section", whitespace, then
    // '.' immediately followed by an identifier.
    void parse_section_decl() {
        match_string("section");
        parse_whitespace();

        match_char('.');
        parse_identifier();
    }
    // Parses a label: an identifier immediately followed by ':'.
    void parse_label() {
        parse_identifier();

        match_char(':');
    }
    // Parses the meaningful content of a line, one of:
    //   1. an include declaration,
    //   2. a section declaration,
    //   3. an optional annotation (+ whitespace), an optional label
    //      (+ optional whitespace) and an optional instruction, in that order.
    // NOTE(review): all guards and lookahead checks are the constant `true`
    // placeholder, so only the include alternative is reachable as written;
    // once option 3 is reachable, its three "optional" parts would all be
    // parsed unconditionally until real lookahead checks are filled in.
    void parse_line_content() {
        if (/* option 1 */ true) {
            parse_include_decl();
        } else if (/* option 2 */ true) {
            parse_section_decl();
        } else if (/* option 3 */ true) {
            if (/* option 1 */ true) {
                // Optional block
                if (/* lookahead check */ true) {
                    if (/* option 1 */ true) {
                        parse_annotation();
                        parse_whitespace();
                    }
                }
                // Optional block
                if (/* lookahead check */ true) {
                    if (/* option 1 */ true) {
                        parse_label();
                        parse_ws_optional();
                    }
                }
                // Optional block
                if (/* lookahead check */ true) {
                    if (/* option 1 */ true) {
                        parse_instruction();
                    }
                }
            }
        }
    }
    // Parses one newline-terminated line: optional whitespace, optional line
    // content, optional whitespace, optional comment, then a newline.
    // NOTE(review): both lookahead checks are the constant `true`
    // placeholder, so content and comment are currently always parsed.
    void parse_line() {
        if (/* option 1 */ true) {
            parse_ws_optional();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_line_content();
                }
            }
            parse_ws_optional();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_comment();
                }
            }
            parse_newline();
        }
    }
    // Parses the final line of the input: same shape as parse_line() but
    // without the trailing newline.
    // NOTE(review): both lookahead checks are the constant `true`
    // placeholder, so content and comment are currently always parsed.
    void parse_line_last() {
        if (/* option 1 */ true) {
            parse_ws_optional();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_line_content();
                }
            }
            parse_ws_optional();
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_comment();
                }
            }
        }
    }
    // Parses a whole program: zero or more newline-terminated lines followed
    // by an optional final line without a newline.
    // NOTE(review): the repeat loop's condition is the constant `true` and the
    // body contains no `break`/`return`, so the loop can only end if a callee
    // throws or otherwise leaves the function; parse_line_last() is therefore
    // not reached by normal control flow until a real lookahead check is
    // filled in.
    void parse_program() {
        if (/* option 1 */ true) {
            // Repeat block
            while (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_line();
                }
            }
            // Optional block
            if (/* lookahead check */ true) {
                if (/* option 1 */ true) {
                    parse_line_last();
                }
            }
        }
    }
 };
@@ -0,0 +1,10 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 namespace spider {
    // Forward declarations of the token classes, for code that only needs
    // to name these types without pulling in their full definitions.
    class Token;
    class RootToken;
 }
@@ -0,0 +1,35 @@
 #include "Assembler.hpp"
 namespace spider {
    // Nothing to set up: all members default-construct themselves.
    Assembler::Assembler() = default;
    // Members release their own resources; no manual cleanup is needed.
    Assembler::~Assembler() = default;
    // Loads `path` as a new level and parses it.
    //
    // Returns FILE_NOT_FOUND when the path does not exist,
    // FILE_RECURSIVE_LOAD when the same canonical file is already on the
    // load stack, and SUCCESS otherwise.
    // May still throw: fs::exists / fs::canonical on other filesystem errors,
    // and FileTextReader's constructor when the file cannot be opened.
    Assembler::Error Assembler::loadFile(const fs::path& path) {
        // fs::canonical throws for a path that does not exist, so the
        // existence check has to come before canonicalising.
        if(!fs::exists(path)) return Error::FILE_NOT_FOUND;
        const fs::path abs_path = fs::canonical(path);
        // check if recursive
        if(fstack.contains(abs_path)) return Error::FILE_RECURSIVE_LOAD;
        // Actually load! The reader is created before the path is recorded
        // in fstack, so a failed open does not leave a stale entry behind.
        levels.emplace_back(Level {
            .reader = std::make_unique<FileTextReader>(abs_path.string()),
            .source = abs_path.string(),
        });
        const auto ir = fstack.insert(abs_path);
        parseCurrentLevel();
        // alright! The file is no longer "in progress".
        fstack.erase(ir.first);
        return Error::SUCCESS;
    }
    void Assembler::parseCurrentLevel() {
        auto& lvl = levels.back();
    }
 }
@@ -0,0 +1,52 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 #include <spider/compiler/text/TextReader.hpp>
 #include <spider/compiler/tokens/RootToken.hpp>
 namespace spider {
    /**
     * The spider assembler: converts assembly text into bytecode.
     */
    class Assembler {
    public:
        /** Result codes returned by the loading API. */
        enum class Error {
            SUCCESS,
            FILE_NOT_FOUND,
            FILE_RECURSIVE_LOAD,
        };

        /** One entry of the load stack: a source being read and its root token. */
        struct Level {
            uptr<TextReader> reader; // where the text comes from
            RootToken root;          // root token for this source
            std::string source;      // name of the source (the file path for files)
        };

        Assembler();
        ~Assembler();

        /**
         * Attempts to load a file; fails if it doesn't exist or if the
         * same file is already being loaded.
         */
        Error loadFile(const fs::path& path);

        /** Canonical paths of the files currently being loaded. */
        set<fs::path> fstack;
        /** Every level pushed so far; the back is the current one. */
        deque<Level> levels;

    private:
        /** Parses the level at the back of `levels`. */
        void parseCurrentLevel();
    };
 }
@@ -0,0 +1,11 @@
 #pragma once
 namespace spider {
    /**
     * A disassembler, capable of converting bytecode into
     * readable text.
     *
     * Currently an empty placeholder: no members are declared yet.
     */
    class Disassembler {};
 }
@@ -0,0 +1,55 @@
 #pragma once
 #include <cstdint>
 #include <vector>
 #include <deque>
 #include <map>
 #include <optional>
 #include <string>
 #include <memory>
 #include <filesystem>
 #include <set>
 namespace spider {
    // Absolute Types: fixed-width integer and floating-point aliases.
    using u8 = std::uint8_t;
    using u16 = std::uint16_t;
    using u32 = std::uint32_t;
    using u64 = std::uint64_t;
    using i8 = std::int8_t;
    using i16 = std::int16_t;
    using i32 = std::int32_t;
    using i64 = std::int64_t;
    using f32 = float; // TODO: SPIDER_EMULATE_FLOAT will control this
    using f64 = double;
    // TODO: Check if we're on C++23, there is already stdfloat
    // Fail the build on platforms where float/double do not have the
    // expected sizes.
    static_assert(sizeof(f32) == 4, "The f32 type must be exactly 4 bytes.");
    static_assert(sizeof(f64) == 8, "The f64 type must be exactly 8 bytes.");
    // Utility types
    // NOTE: despite the `i` prefix, isize aliases std::size_t and is unsigned.
    using isize = std::size_t;
    // Utility imports: standard containers usable without the std:: prefix
    // inside namespace spider.
    using std::vector;
    using std::deque;
    using std::map;
    using std::optional;
    using std::set;
    // Smart-pointer shorthands: ptr = shared ownership, uptr = unique ownership.
    template<typename T> using ptr = std::shared_ptr<T>;
    template<typename T> using uptr = std::unique_ptr<T>;
    // Filesystem shorthand.
    namespace fs = std::filesystem;
    struct pos {
        isize line;
        isize col;
        pos(isize line = 1, isize col = 1)
        : line(line), col(col) {}
    };
 }
@@ -0,0 +1,104 @@
 #include "TextReader.hpp"
 #include <spider/compiler/text/utf8.hpp>
 #include <stdexcept>
 namespace spider {
    // Text Reader //
    // Reads one raw byte from the underlying stream.
    // Returns the byte value, or -1 once the stream reports end-of-file.
    int TextReader::nextByte() {
        const int byte = getStream().get();
        const bool atEnd = (byte == std::istream::traits_type::eof());
        return atEnd ? -1 : byte;
    }
    // Reads the next UTF-8 encoded code point into `ch` and updates the
    // tracked position.
    // Returns false (leaving `ch` and the position untouched) when the stream
    // is exhausted, the lead byte is not a valid UTF-8 lead byte, the
    // sequence is cut off by end-of-file, or a trailing byte is not a
    // continuation byte. Bytes read before the failure was detected,
    // including the offending byte, stay consumed.
    bool TextReader::nextChar(u32& ch) {
        int n = nextByte();
        if(n == -1) return false;
        const isize len = utf8::seqlen(u8(n));
        if(len == 0) return false;
        char arr[4];
        arr[0] = char(n);
        for(isize i = 1; i < len; ++i) {
            n = nextByte();
            if(n == -1) return false;
            // decodeArr assumes validated input, so malformed trailing bytes
            // must be rejected here instead of being folded into the result.
            if(!utf8::isCont(u8(n))) return false;
            arr[i] = char(n);
        }
        ch = utf8::decodeArr(arr, len);
        advance(ch);
        return true;
    }
    // Updates the tracked position after `ch` has been read.
    // CR, LF and CRLF each count as a single line break: the line is bumped
    // on the CR, and an LF directly after a CR is swallowed.
    void TextReader::advance(u32 ch) {
        const bool isLF = (ch == u32('\n'));
        const bool isCR = (ch == u32('\r'));

        if (!isLF && !isCR) {
            // Ordinary character: just move one column to the right.
            at.col++;
            lastWasCR = false;
            return;
        }
        if (isLF && lastWasCR) {
            // Second half of a CRLF pair; the line was already counted.
            lastWasCR = false;
            return;
        }
        // A lone LF or a CR starts a new line.
        at.line++;
        at.col = 1;
        lastWasCR = isCR;
    }
    // True when peeking the stream yields end-of-file, i.e. there is no
    // further byte to read.
    bool TextReader::isEOF() {
        const auto upcoming = getStream().peek();
        return upcoming == std::istream::traits_type::eof();
    }
    // Returns a copy of the currently tracked line/column position.
    pos TextReader::getPosition() const {
        const pos current = at;
        return current;
    }
    // File Reader //
    // Opens `filename` in binary mode.
    // Throws std::runtime_error when the file cannot be opened.
    FileTextReader::FileTextReader(const std::string& filename)
    : fileStream(filename, std::ios::binary) {
        if (fileStream.is_open()) return;
        throw std::runtime_error("Failed to open file: " + filename);
    }
    // Exposes the owned file stream to the base class.
    std::istream& FileTextReader::getStream() {
        std::istream& stream = fileStream;
        return stream;
    }
    // String Reader //
    // Takes ownership of `initialText` and opens a stream over a copy of it.
    StringTextReader::StringTextReader(std::string initialText)
    : buffer(std::move(initialText)) {
        stringStream = std::make_unique<std::istringstream>(buffer);
    }
    // Exposes the owned string stream to the base class.
    std::istream& StringTextReader::getStream() {
        std::istringstream& stream = *stringStream;
        return stream;
    }
    void StringTextReader::set(const std::string& newText) {
        buffer = newText;
        stringStream = std::make_unique<std::istringstream>(buffer);
        lastWasCR = false;
    }
    // Appends `extraText` to the buffered text; reading continues from where
    // it left off.
    void StringTextReader::append(const std::string& extraText) {
        // Once the reader has hit end-of-file the stream carries eof/fail
        // bits, and tellg() would then report -1. Clear them first so the
        // real read offset is obtained.
        stringStream->clear();
        const std::streampos consumed = stringStream->tellg();
        buffer += extraText;
        stringStream = std::make_unique<std::istringstream>(buffer);
        // Seeking to an invalid offset would put the fresh stream into a
        // failed state, so only restore a valid one.
        if (consumed != std::streampos(-1)) {
            stringStream->seekg(consumed);
        }
    }
 }
@@ -0,0 +1,91 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <string>
 #include <memory>
 namespace spider {
    /**
     * Abstract text reader: decodes UTF-8 characters from a byte stream
     * supplied by a subclass and keeps track of the line/column position.
     */
    class TextReader {
    public:
        TextReader() = default;
        virtual ~TextReader() = default;

        /** Reads the next character into `ch`; returns false if none could be read. */
        bool nextChar(u32& ch);
        /** True when the underlying stream has no more bytes. */
        bool isEOF();
        /** The current line/column position. */
        pos getPosition() const;

    protected:
        /** Reads one raw byte, or returns -1 at end-of-file. */
        int nextByte();
        /** Updates the position after `ch` has been read. */
        void advance(u32 ch);
        /** The stream to read bytes from; provided by subclasses. */
        virtual std::istream& getStream() = 0;

        pos at;
        /** Set after a '\r' so that a following '\n' is not counted as a second line break. */
        bool lastWasCR = false;
    };
    /**
     * Text reader backed by a file.
     */
    class FileTextReader : public TextReader {
    public:
        /** Opens `filename`; throws std::runtime_error if it cannot be opened. */
        explicit FileTextReader(const std::string& filename);

    protected:
        std::istream& getStream() override;

    private:
        std::ifstream fileStream;
    };
    /**
     * Text reader backed by an in-memory string that can be replaced or
     * extended after construction.
     */
    class StringTextReader : public TextReader {
    public:
        explicit StringTextReader(std::string initialText = "");

        /** Replaces the whole text with `newText`. */
        void set(const std::string& newText);
        /** Adds `extraText` to the end of the current text. */
        void append(const std::string& extraText);

    protected:
        std::istream& getStream() override;

    private:
        // Declaration order matters: the stream is built from `buffer`.
        std::string buffer;
        std::unique_ptr<std::istringstream> stringStream;
    };
 }
@@ -0,0 +1,91 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 #include <cstdint>
 #include <cstddef>
 #include <string>
 namespace spider {
    namespace utf8 {
        // --------------------- //
        // UTF-8 Sequence Length //
        // --------------------- //
        /**
         * Length in bytes (1-4) of the UTF-8 sequence introduced by lead
         * byte `c`, or 0 when `c` cannot start a well-formed sequence.
         *
         * Continuation bytes (0x80-0xBF) are rejected, as are 0xC0/0xC1
         * (they could only start overlong two-byte forms) and 0xF5-0xFF
         * (they would encode values above U+10FFFF or are not UTF-8 at all).
         */
        constexpr isize seqlen(u8 c) {
            if (c < 0x80) return 1;                // 0xxxxxxx
            if (c >= 0xC2 && c <= 0xDF) return 2;  // 110xxxxx, minus overlong C0/C1
            if ((c & 0xF0) == 0xE0) return 3;      // 1110xxxx
            if (c >= 0xF0 && c <= 0xF4) return 4;  // 11110xxx, capped at U+10FFFF
            return 0;
        }
        /** True when `c` is a UTF-8 continuation byte, i.e. has the form 10xxxxxx. */
        constexpr bool isCont(u8 c) {
            return (c >> 6) == 0x02;
        }
        /**
         * Checks that `src` (with `len` readable bytes) starts with one
         * well-formed UTF-8 sequence.
         *
         * Returns the length of that sequence in bytes, or 0 when the input
         * is empty, truncated, has a bad lead or continuation byte, is an
         * overlong encoding, encodes a surrogate (U+D800-U+DFFF), or encodes
         * a value above U+10FFFF.
         */
        constexpr isize isValidSeq(const char* src, isize len) {
            if (len == 0) return 0;
            const u8 lead = u8(src[0]);
            const isize m = seqlen(lead);
            if (m == 0 || m > len) return 0;
            for (isize i = 1; i < m; i++) {
                if (!isCont(u8(src[i]))) return 0;
            }
            if (m == 1) return m;
            // The bit patterns alone still admit ill-formed sequences; the
            // lead byte and the second byte are enough to rule them out.
            const u8 second = u8(src[1]);
            if (m == 2) {
                if (lead < 0xC2) return 0;                     // overlong
            } else if (m == 3) {
                if (lead == 0xE0 && second < 0xA0) return 0;   // overlong
                if (lead == 0xED && second > 0x9F) return 0;   // surrogate
            } else {
                if (lead > 0xF4) return 0;                     // above U+10FFFF
                if (lead == 0xF0 && second < 0x90) return 0;   // overlong
                if (lead == 0xF4 && second > 0x8F) return 0;   // above U+10FFFF
            }
            return m;
        }
        // ----------------- //
        // UTF-8 into UTF-32 //
        // ----------------- //
        /**
         * Decodes the UTF-8 sequence at the start of `src` (with `len`
         * readable bytes) into `out`.
         *
         * Returns the number of bytes consumed, or 0 (leaving `out`
         * untouched) when isValidSeq rejects the input.
         */
        inline isize decode(const char* src, isize len, u32& out) {
            const isize count = isValidSeq(src, len);
            if (count == 0) return 0;

            // Payload bits of the lead byte, indexed by sequence length.
            static constexpr u8 leadMask[5] = {
                0x00, // unused
                0x7F, // 0xxxxxxx
                0x1F, // 110xxxxx
                0x0F, // 1110xxxx
                0x07  // 11110xxx
            };

            // Start with the lead byte's payload, then fold in six bits
            // from every continuation byte.
            u32 codepoint = u8(src[0]) & leadMask[count];
            isize idx = 1;
            while (idx < count) {
                codepoint = (codepoint << 6) | (u8(src[idx]) & 0x3F);
                ++idx;
            }
            out = codepoint;
            return count;
        }
        /**
         * A simpler decoder that assumes its input has already been
         * validated: `src` must hold one complete sequence of `chlen`
         * bytes, with `chlen` between 1 and 4. No checks are performed.
         */
        inline u32 decodeArr(const char* src, isize chlen) {
            // Payload bits of the lead byte, indexed by sequence length.
            static constexpr u8 leadMask[5] = {
                0x00, // unused
                0x7F, // 0xxxxxxx
                0x1F, // 110xxxxx
                0x0F, // 1110xxxx
                0x07  // 11110xxx
            };

            // Lead byte payload first, then six bits per continuation byte.
            u32 codepoint = u8(src[0]) & leadMask[chlen];
            isize idx = 1;
            while (idx < chlen) {
                codepoint = (codepoint << 6) | (u8(src[idx]) & 0x3F);
                ++idx;
            }
            return codepoint;
        }
    }
 }
@@ -0,0 +1,25 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 namespace spider {
    /**
     * The root of a token.
     *
     * NOTE(review): only the interface is declared here; what token() does
     * is not visible from this header — confirm in the implementation.
     */
    class RootToken {
    public:
        RootToken();
        ~RootToken();

        void token();
    };
 }
@@ -0,0 +1,26 @@
 #include "Token.hpp"
 namespace spider {
    // Builds a token with no inner tokens. `_str` is received by value, so
    // it is moved into the member instead of being copied a second time.
    Token::Token(pos _at, TokenType _type, std::string _str)
    : at(_at), type(_type), str(std::move(_str)) {}
    // Copies the token together with all of its inner tokens.
    Token::Token(const Token& tok)
    : at(tok.at), type(tok.type), str(tok.str), inner(tok.inner) {}
    // Moves the inner tokens out of `tok`. `str` is a const member and
    // therefore can only be copied.
    Token::Token(Token&& tok)
    : at(tok.at), type(tok.type), str(tok.str), inner(std::move(tok.inner)) {}
    // Adds a copy of `tok` as the last inner token.
    void Token::append(const Token& tok) {
        inner.push_back(tok);
    }
    // Returns a copy of the inner tokens.
    vector<Token> Token::getInner() {
        return inner;
    }
    // Number of direct inner tokens.
    isize Token::innerCount() {
        return inner.size();
    }
 }
@@ -0,0 +1,56 @@
 #pragma once
 #include <spider/compiler/common.hpp>
 namespace spider {
    /**
     * The kind of a token. Only the assembly kinds exist so far.
     */
    enum class TokenType {
        // Assembly
        PREPROCESSOR_TAG,
        WHITESPACE,
        NEWLINE,
        INSTRUCTION,
        OPCODE,
        OPERATOR,
        OPERAND,
        REGISTER,
        NUMBER,
        BIN_NUMBER,
        OCT_NUMBER,
        HEX_NUMBER,
        ADDR_NUMBER,
        BRACKET,
        BRACKET_IND,
        BRACKET_PTR,
        BRACKET_IDX,
        COMMA,
        COMMENT,
        SECTION,
        VARIABLE,
        ASSIGNMENT
        // Classic
        // Script
    };
    /**
     * A general token: an immutable position, type and text, plus a list
     * of inner tokens that can be appended to.
     */
    class Token {
    public:
        const pos at;          // where the token starts
        const TokenType type;  // what kind of token this is
        const std::string str; // the token's text

        Token(pos _at, TokenType _type, std::string _str);
        Token(const Token& tok);
        Token(Token&& tok);

        /** Adds a copy of `tok` as the last inner token. */
        void append(const Token& tok);
        /** Returns a copy of the inner tokens. */
        vector<Token> getInner();
        /** Number of direct inner tokens. */
        isize innerCount();

    private:
        vector<Token> inner;
    };
 }