beginning of compiler

This commit is contained in:
2026-06-20 11:53:08 -06:00
parent 5ddecb0c38
commit 9176c4882f
20 changed files with 1784 additions and 0 deletions
+6
View File
@@ -0,0 +1,6 @@
# For now, ignore user builds
# We will eventually change to a custom
# build system.
# So hold on
/bin
/out
+302
View File
@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 95,
"id": "00e26c5b",
"metadata": {},
"outputs": [],
"source": [
"from lark import Lark, Transformer\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "cc16be1a",
"metadata": {},
"outputs": [],
"source": [
"ebnf_targets = {\n",
" \"assembly\": {\n",
" \"src\": \"./samples/assembly.ebnf\",\n",
" \"dst\": \"./spider/compiler/assembly/AssemblyParser.hpp\",\n",
" \"cnt\": None,\n",
" },\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "e88d212f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--- Loading EBNF Targets ---\n",
"✅ Success [assembly]: Loaded './samples/assembly.ebnf' -> Target destination: './spider/compiler/assembly/AssemblyParser.hpp'\n"
]
}
],
"source": [
"print(\"\\n--- Loading EBNF Targets ---\")\n",
"for target_name, paths in ebnf_targets.items():\n",
" src_path = paths[\"src\"]\n",
" dst_path = paths[\"dst\"]\n",
" \n",
" try:\n",
" with open(src_path, \"r\", encoding=\"utf-8\") as file:\n",
" paths[\"cnt\"] = file.read()\n",
" print(f\"✅ Success [{target_name}]: Loaded '{src_path}' -> Target destination: '{dst_path}'\")\n",
" \n",
" except FileNotFoundError:\n",
" print(f\"❌ Error [{target_name}]: Source file not found at '{src_path}'\")\n"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "e8095002",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from lark import Lark, Transformer\n",
"\n",
"iso_ebnf_meta_grammar = r\"\"\"\n",
" start: rule+\n",
" rule: RULE_NAME \"=\" expression \";\"\n",
" \n",
" ?expression: alternation\n",
" alternation: sequence (\"|\" sequence)*\n",
" \n",
" sequence: item ( [\",\"] item )*\n",
" \n",
" ?item: atom\n",
" | atom \"?\" -> optional\n",
" | atom \"*\" -> repeat\n",
" | \"[\" expression \"]\" -> optional\n",
" | \"{\" expression \"}\" -> repeat\n",
" \n",
" ?atom: RULE_NAME -> call_rule\n",
" | TERMINAL -> match_terminal\n",
" | SPECIAL_SEQ -> handle_special\n",
" | \"(\" expression \")\" -> group\n",
"\n",
" RULE_NAME: /[a-zA-Z_][a-zA-Z0-9_]*/\n",
" TERMINAL: /\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"/ | /'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'/\n",
" SPECIAL_SEQ: /\\?[\\s\\S]*?\\?/\n",
" COMMENT: /\\(\\*([\\s\\S]*?)\\*\\)/\n",
"\n",
" %import common.WS\n",
" %ignore WS\n",
" %ignore COMMENT\n",
"\"\"\"\n",
"\n",
"class AssemblyCppGenerator(Transformer):\n",
" def start(self, rules):\n",
" cpp_functions = \"\\n\\n\".join(rules)\n",
" return f\"\"\"#pragma once\n",
"\n",
"#include <iostream>\n",
"#include <string>\n",
"#include <vector>\n",
"#include <stdexcept>\n",
"\n",
"class AssemblyParser {{\n",
"private:\n",
" std::string src;\n",
" size_t pos = 0;\n",
"\n",
" std::string peek_str(size_t len) {{\n",
" if (pos + len <= src.length()) return src.substr(pos, len);\n",
" return src.substr(pos);\n",
" }}\n",
"\n",
" char peek() {{ return pos < src.length() ? src[pos] : '\\\\0'; }}\n",
" \n",
" void match_char(char expected) {{\n",
" if (peek() == expected) pos++;\n",
" else throw std::runtime_error(\"Unexpected token matching character\");\n",
" }}\n",
"\n",
" void match_string(std::string expected) {{\n",
" if (peek_str(expected.length()) == expected) pos += expected.length();\n",
" else throw std::runtime_error(\"Unexpected token matching string: \" + expected);\n",
" }}\n",
"\n",
" bool isUTF8Alpha() {{ return isalpha(peek()); }}\n",
" bool isWhithespaceCharNotCrLf() {{ return peek() == ' ' || peek() == '\\\\t'; }}\n",
" bool isUTF8CharNotCrLf() {{ return peek() != '\\\\r' && peek() != '\\\\n' && peek() != '\\\\0'; }}\n",
" bool isUTF8CharLitCont() {{ return peek() != '\\'' && peek() != '\\\\\\\\'; }}\n",
" bool isUTF8StringLitCont() {{ return peek() != '\"' && peek() != '\\\\\\\\'; }}\n",
"\n",
"public:\n",
" AssemblyParser(std::string input) : src(input) {{}}\n",
"\n",
" void parse() {{\n",
" parse_program(); \n",
" if (pos < src.length()) throw std::runtime_error(\"Trailing characters left unparsed.\");\n",
" std::cout << \"Assembly source compiled cleanly!\" << std::endl;\n",
" }}\n",
"\n",
"{cpp_functions}\n",
"}};\n",
"\"\"\"\n",
"\n",
" def rule(self, args):\n",
" name, expr = args\n",
" return f\" void parse_{name}() {{\\n{expr}\\n }}\"\n",
"\n",
" # FIX 1: Explicitly handle choice logic using C++ style paths\n",
" def alternation(self, items):\n",
" code_lines = []\n",
" for i, item in enumerate(items):\n",
" # Clean up padding whitespace if any\n",
" clean_item = str(item).strip()\n",
" if not clean_item: continue\n",
" \n",
" # Since lookahead processing requires FIRST sets, we scaffold a sequential fallback\n",
" if i == 0:\n",
" code_lines.append(f\" if (/* option {i+1} */ true) {{\\n {clean_item}\\n }}\")\n",
" else:\n",
" code_lines.append(f\" else if (/* option {i+1} */ true) {{\\n {clean_item}\\n }}\")\n",
" return \"\\n\".join(code_lines)\n",
"\n",
" def sequence(self, items):\n",
" flattened_items = []\n",
" for item in items:\n",
" if isinstance(item, list):\n",
" for sub_item in item:\n",
" if sub_item: flattened_items.append(str(sub_item).strip())\n",
" elif item:\n",
" flattened_items.append(str(item).strip())\n",
" return \"\\n\".join(f\" {item}\" for item in flattened_items if item)\n",
"\n",
" def call_rule(self, token):\n",
" rule_name = token[0].value if isinstance(token, list) else token.value\n",
" return f\"parse_{rule_name}();\"\n",
"\n",
" # FIX 2: Generate match_string instead of match_char for multi-char string keywords like \"include\"\n",
" def match_terminal(self, token):\n",
" raw_token_str = token[0].value if isinstance(token, list) else token.value\n",
" raw_val = raw_token_str[1:-1]\n",
" \n",
" if raw_val == r\"\\r\": return \"match_char('\\\\r');\"\n",
" if raw_val == r\"\\n\": return \"match_char('\\\\n');\"\n",
" if raw_val == r\"\\t\": return \"match_char('\\\\t');\"\n",
" if raw_val == r\"\\\\\": return \"match_char('\\\\\\\\');\"\n",
" if not raw_val: return \"// Empty string match\"\n",
" \n",
" if len(raw_val) > 1:\n",
" return f\"match_string(\\\"{raw_val}\\\");\"\n",
" return f\"match_char('{raw_val}');\"\n",
"\n",
" def handle_special(self, token):\n",
" raw_string = token[0].value if isinstance(token, list) else token.value\n",
" func_name = raw_string.strip('?').strip()\n",
" return f\"if ({func_name}()) {{ pos++; }} else {{ throw std::runtime_error(\\\"Failed validation for {func_name}\\\"); }}\"\n",
"\n",
" def optional(self, args):\n",
" content = args[0] if not isinstance(args[0], list) else \"\\n \".join(args[0])\n",
" return f\"// Optional block\\n if (/* lookahead check */ true) {{\\n {content}\\n }}\"\n",
"\n",
" def repeat(self, args):\n",
" content = args[0] if not isinstance(args[0], list) else \"\\n \".join(args[0])\n",
" return f\"// Repeat block\\n while (/* lookahead check */ true) {{\\n {content}\\n }}\"\n",
"\n",
" def group(self, args):\n",
" # Flatten grouped elements cleanly to strings\n",
" if isinstance(args, list):\n",
" return \"\\n\".join(str(x) for x in args)\n",
" return str(args)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "558915ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- Starting C++ Compilation Loop ---\n",
"Parsing and converting target rule sets for: assembly\n",
"🎉 Code generation complete! Output stored in './spider/compiler/assembly/AssemblyParser.hpp'\n"
]
}
],
"source": [
"print(\"--- Starting C++ Compilation Loop ---\")\n",
"\n",
"try:\n",
" meta_parser = Lark(iso_ebnf_meta_grammar, parser='lalr')\n",
" \n",
" for name, target in ebnf_targets.items():\n",
" print(f\"Parsing and converting target rule sets for: {name}\")\n",
" \n",
" # Build the compiler AST tree from your exact text\n",
" syntax_tree = meta_parser.parse(target[\"cnt\"])\n",
" \n",
" # Transform the AST structural nodes into pure C++ Source strings\n",
" compiler_transformer = AssemblyCppGenerator()\n",
" compiled_cpp_header = compiler_transformer.transform(syntax_tree)\n",
" \n",
" # Output directly to your destination path\n",
" os.makedirs(os.path.dirname(target[\"dst\"]), exist_ok=True)\n",
" with open(target[\"dst\"], \"w\", encoding=\"utf-8\") as f:\n",
" f.write(compiled_cpp_header)\n",
" \n",
" print(f\"🎉 Code generation complete! Output stored in '{target['dst']}'\")\n",
"\n",
"except Exception as e:\n",
" print(f\"❌ Failed to process custom architecture. Error details: \\n{e}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366688c3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd1aca3f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+76
View File
@@ -0,0 +1,76 @@
(* Spider Assembly EBNF | Sintek Analytics @ 2026 | All Rights Reserved *)
(* Characters & Structures *)
letter = ? isUTF8Alpha ? ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
alpha_num_char = letter | digit ;
hex_digit = digit | "A" | "B" | "C" | "D" | "E" | "F" | "a" | "b" | "c" | "d" | "e" | "f" ;
octal_digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
binary_digit = "0" | "1" ;
ws_char = ? isWhithespaceCharNotCrLf ? ;
ws_optional = { ws_char } ;
whitespace = ws_char , { ws_char } ;
newline = "\r" | "\n" | "\r\n" ;
utf8_char = ? isUTF8CharNotCrLf ? ;
char_escape = "\\", utf8_char ;
char_content = char_escape | ? isUTF8CharLitCont ? ; (* Not ' or \ *)
char_lit = "'", char_content, "'" ;
string_char = char_escape | ? isUTF8StringLitCont ? ; (* Not " or \ *)
string_lit = '"', { string_char }, '"' ;
(* Literals *)
identifier = ( letter | "_" ) , { alpha_num_char | "_" } ;
comment = ";" , { utf8_char } ;
sign = "+" | "-" ;
exponent_marker = "e" | "E" ;
exponent = exponent_marker , [ sign ] , digit , { digit } ;
decimal_lit = [ sign ] , digit , { digit } , [ "B" | "S" | "I" | "L" ] ;
float_lit = [ sign ] , (
( digit , { digit } , "." , digit , { digit } , [ exponent ] ) |
( "." , digit , { digit } , [ exponent ] ) |
( digit , { digit } , exponent )
) , [ "F" | "D" ] ;
hex_lit = [ sign ] , "0x" , hex_digit , { hex_digit } ;
octal_lit = [ sign ] , "0c" , octal_digit , { octal_digit } ;
binary_lit = [ sign ] , "0b" , binary_digit , { binary_digit } ;
literal = decimal_lit | float_lit | hex_lit | octal_lit | binary_lit | string_lit | char_lit ;
literal_cast = ("B" | "S" | "I" | "L" | "F" | "D"), ws_optional, "(", ws_optional, literal, ws_optional, ")" ;
literal_decl = literal | literal_cast ;
(* Operands *)
register = "R" , alpha_num_char , alpha_num_char ;
addrm_ind = "[", ws_optional, literal_decl, ws_optional, "]" ;
addrm_ptr = "[", ws_optional, register, ws_optional, "]" ;
addrm_idx = "[", ws_optional, register, ws_optional, "+", ws_optional, literal_decl, ws_optional, "]";
addrm_sca = "[", ws_optional, register, ws_optional, "+", register, ws_optional, "*", ws_optional, literal_decl, ws_optional, "]";
addrm_dis = "[", ws_optional, register, ws_optional, "+", register, ws_optional, "*", ws_optional, literal_decl, ws_optional, "+", ws_optional, literal_decl, ws_optional, "]";
addr_modes = addrm_ind | addrm_ptr | addrm_idx | addrm_sca | addrm_dis ;
operand = register | identifier | literal_decl | addr_modes ;
(* Generalized Instructions *)
opcode = letter , { alpha_num_char } ;
operand_list = operand , { "," , ws_optional , operand } ;
instruction = opcode , [ whitespace , operand_list ] ;
(* Added Preprocessor, Sections, and Metadata Syntaxes *)
include_decl = "include", whitespace, string_lit ;
annotation_oper = identifier, [ ws_optional, "=", ws_optional, literal_decl ] ;
annotation_ops = annotation_oper , { ws_optional, "," , ws_optional , annotation_oper } ;
annotation_args = "(", ws_optional, annotation_ops, ws_optional, ")" ;
annotation = "@", identifier, [ annotation_args ] ;
section_decl = "section", whitespace, ".", identifier ;
(* Line Structure *)
label = identifier, ":" ;
line_content = include_decl | section_decl | ( [ annotation, whitespace ], [ label, ws_optional ], [ instruction ] ) ;
line = ws_optional, [ line_content ], ws_optional , [ comment ] , newline ;
line_last = ws_optional, [ line_content ], ws_optional , [ comment ] ;
program = { line }, [ line_last ] ;
+11
View File
@@ -0,0 +1,11 @@
@asm
.data
.code
MOV RA, 1
MOV RB, 8 ; Input number
:loop_start
MUL RA, RB
NOT RB ; RB != 0? Updates equal flag
DEC RB ; RB -= 1
JEQ loop_start ; If equal flag, goto loop_start
; End program, result in RA
+21
View File
@@ -0,0 +1,21 @@
{
"folders": [
{
"path": "."
}
],
"settings": {
"gitlens.remotes": [
{
"domain": "git.sintekanalytics.com",
"type": "Gitea",
"name": "Sintek Analytics' Git",
"protocol": "https",
}
],
"C_Cpp.default.includePath": [
"./src"
],
"terminal.integrated.defaultProfile.windows": "MSYS2 UCRT"
}
}
+812
View File
@@ -0,0 +1,812 @@
#pragma once
#include <cctype>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
class AssemblyParser {
private:
std::string src;
size_t pos = 0;
std::string peek_str(size_t len) {
if (pos + len <= src.length()) return src.substr(pos, len);
return src.substr(pos);
}
char peek() { return pos < src.length() ? src[pos] : '\0'; }
void match_char(char expected) {
if (peek() == expected) pos++;
else throw std::runtime_error("Unexpected token matching character");
}
void match_string(std::string expected) {
if (peek_str(expected.length()) == expected) pos += expected.length();
else throw std::runtime_error("Unexpected token matching string: " + expected);
}
// True when the byte at the cursor is classified as a letter by std::isalpha.
// Only a single byte is inspected, so multi-byte UTF-8 letters are not recognised.
// The cast to unsigned char is required: passing a negative char (any byte >= 0x80
// on platforms where char is signed) to std::isalpha is undefined behaviour.
bool isUTF8Alpha() { return std::isalpha(static_cast<unsigned char>(peek())) != 0; }
bool isWhithespaceCharNotCrLf() { return peek() == ' ' || peek() == '\t'; }
bool isUTF8CharNotCrLf() { return peek() != '\r' && peek() != '\n' && peek() != '\0'; }
bool isUTF8CharLitCont() { return peek() != '\'' && peek() != '\\'; }
bool isUTF8StringLitCont() { return peek() != '"' && peek() != '\\'; }
public:
AssemblyParser(std::string input) : src(input) {}
void parse() {
parse_program();
if (pos < src.length()) throw std::runtime_error("Trailing characters left unparsed.");
std::cout << "Assembly source compiled cleanly!" << std::endl;
}
void parse_letter() {
if (/* option 1 */ true) {
if (isUTF8Alpha()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8Alpha"); }
}
}
void parse_digit() {
if (/* option 1 */ true) {
match_char('0');
} else if (/* option 2 */ true) {
match_char('1');
} else if (/* option 3 */ true) {
match_char('2');
} else if (/* option 4 */ true) {
match_char('3');
} else if (/* option 5 */ true) {
match_char('4');
} else if (/* option 6 */ true) {
match_char('5');
} else if (/* option 7 */ true) {
match_char('6');
} else if (/* option 8 */ true) {
match_char('7');
} else if (/* option 9 */ true) {
match_char('8');
} else if (/* option 10 */ true) {
match_char('9');
}
}
void parse_alpha_num_char() {
if (/* option 1 */ true) {
parse_letter();
} else if (/* option 2 */ true) {
parse_digit();
}
}
void parse_hex_digit() {
if (/* option 1 */ true) {
parse_digit();
} else if (/* option 2 */ true) {
match_char('A');
} else if (/* option 3 */ true) {
match_char('B');
} else if (/* option 4 */ true) {
match_char('C');
} else if (/* option 5 */ true) {
match_char('D');
} else if (/* option 6 */ true) {
match_char('E');
} else if (/* option 7 */ true) {
match_char('F');
} else if (/* option 8 */ true) {
match_char('a');
} else if (/* option 9 */ true) {
match_char('b');
} else if (/* option 10 */ true) {
match_char('c');
} else if (/* option 11 */ true) {
match_char('d');
} else if (/* option 12 */ true) {
match_char('e');
} else if (/* option 13 */ true) {
match_char('f');
}
}
void parse_octal_digit() {
if (/* option 1 */ true) {
match_char('0');
} else if (/* option 2 */ true) {
match_char('1');
} else if (/* option 3 */ true) {
match_char('2');
} else if (/* option 4 */ true) {
match_char('3');
} else if (/* option 5 */ true) {
match_char('4');
} else if (/* option 6 */ true) {
match_char('5');
} else if (/* option 7 */ true) {
match_char('6');
} else if (/* option 8 */ true) {
match_char('7');
}
}
void parse_binary_digit() {
if (/* option 1 */ true) {
match_char('0');
} else if (/* option 2 */ true) {
match_char('1');
}
}
void parse_ws_char() {
if (/* option 1 */ true) {
if (isWhithespaceCharNotCrLf()) { pos++; } else { throw std::runtime_error("Failed validation for isWhithespaceCharNotCrLf"); }
}
}
void parse_ws_optional() {
if (/* option 1 */ true) {
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_ws_char();
}
}
}
}
void parse_whitespace() {
if (/* option 1 */ true) {
parse_ws_char();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_ws_char();
}
}
}
}
void parse_newline() {
if (/* option 1 */ true) {
match_char('\r');
} else if (/* option 2 */ true) {
match_char('\n');
} else if (/* option 3 */ true) {
match_string("\r\n");
}
}
void parse_utf8_char() {
if (/* option 1 */ true) {
if (isUTF8CharNotCrLf()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8CharNotCrLf"); }
}
}
void parse_char_escape() {
if (/* option 1 */ true) {
match_char('\\');
parse_utf8_char();
}
}
void parse_char_content() {
if (/* option 1 */ true) {
parse_char_escape();
} else if (/* option 2 */ true) {
if (isUTF8CharLitCont()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8CharLitCont"); }
}
}
void parse_char_lit() {
if (/* option 1 */ true) {
match_char('\'');
parse_char_content();
match_char('\'');
}
}
void parse_string_char() {
if (/* option 1 */ true) {
parse_char_escape();
} else if (/* option 2 */ true) {
if (isUTF8StringLitCont()) { pos++; } else { throw std::runtime_error("Failed validation for isUTF8StringLitCont"); }
}
}
void parse_string_lit() {
if (/* option 1 */ true) {
match_char('"');
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_string_char();
}
}
match_char('"');
}
}
void parse_identifier() {
if (/* option 1 */ true) {
if (/* option 1 */ true) {
parse_letter();
} else if (/* option 2 */ true) {
match_char('_');
}
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_alpha_num_char();
} else if (/* option 2 */ true) {
match_char('_');
}
}
}
}
void parse_comment() {
if (/* option 1 */ true) {
match_char(';');
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_utf8_char();
}
}
}
}
void parse_sign() {
if (/* option 1 */ true) {
match_char('+');
} else if (/* option 2 */ true) {
match_char('-');
}
}
void parse_exponent_marker() {
if (/* option 1 */ true) {
match_char('e');
} else if (/* option 2 */ true) {
match_char('E');
}
}
void parse_exponent() {
if (/* option 1 */ true) {
parse_exponent_marker();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
}
}
void parse_decimal_lit() {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
match_char('B');
} else if (/* option 2 */ true) {
match_char('S');
} else if (/* option 3 */ true) {
match_char('I');
} else if (/* option 4 */ true) {
match_char('L');
}
}
}
}
void parse_float_lit() {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
if (/* option 1 */ true) {
if (/* option 1 */ true) {
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
match_char('.');
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_exponent();
}
}
}
} else if (/* option 2 */ true) {
if (/* option 1 */ true) {
match_char('.');
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_exponent();
}
}
}
} else if (/* option 3 */ true) {
if (/* option 1 */ true) {
parse_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_digit();
}
}
parse_exponent();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
match_char('F');
} else if (/* option 2 */ true) {
match_char('D');
}
}
}
}
void parse_hex_lit() {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
match_string("0x");
parse_hex_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_hex_digit();
}
}
}
}
void parse_octal_lit() {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
match_string("0c");
parse_octal_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_octal_digit();
}
}
}
}
void parse_binary_lit() {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_sign();
}
}
match_string("0b");
parse_binary_digit();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_binary_digit();
}
}
}
}
void parse_literal() {
if (/* option 1 */ true) {
parse_decimal_lit();
} else if (/* option 2 */ true) {
parse_float_lit();
} else if (/* option 3 */ true) {
parse_hex_lit();
} else if (/* option 4 */ true) {
parse_octal_lit();
} else if (/* option 5 */ true) {
parse_binary_lit();
} else if (/* option 6 */ true) {
parse_string_lit();
} else if (/* option 7 */ true) {
parse_char_lit();
}
}
void parse_literal_cast() {
if (/* option 1 */ true) {
if (/* option 1 */ true) {
match_char('B');
} else if (/* option 2 */ true) {
match_char('S');
} else if (/* option 3 */ true) {
match_char('I');
} else if (/* option 4 */ true) {
match_char('L');
} else if (/* option 5 */ true) {
match_char('F');
} else if (/* option 6 */ true) {
match_char('D');
}
parse_ws_optional();
match_char('(');
parse_ws_optional();
parse_literal();
parse_ws_optional();
match_char(')');
}
}
void parse_literal_decl() {
if (/* option 1 */ true) {
parse_literal();
} else if (/* option 2 */ true) {
parse_literal_cast();
}
}
void parse_register() {
if (/* option 1 */ true) {
match_char('R');
parse_alpha_num_char();
parse_alpha_num_char();
}
}
void parse_addrm_ind() {
if (/* option 1 */ true) {
match_char('[');
parse_ws_optional();
parse_literal_decl();
parse_ws_optional();
match_char(']');
}
}
void parse_addrm_ptr() {
if (/* option 1 */ true) {
match_char('[');
parse_ws_optional();
parse_register();
parse_ws_optional();
match_char(']');
}
}
void parse_addrm_idx() {
if (/* option 1 */ true) {
match_char('[');
parse_ws_optional();
parse_register();
parse_ws_optional();
match_char('+');
parse_ws_optional();
parse_literal_decl();
parse_ws_optional();
match_char(']');
}
}
void parse_addrm_sca() {
if (/* option 1 */ true) {
match_char('[');
parse_ws_optional();
parse_register();
parse_ws_optional();
match_char('+');
parse_register();
parse_ws_optional();
match_char('*');
parse_ws_optional();
parse_literal_decl();
parse_ws_optional();
match_char(']');
}
}
void parse_addrm_dis() {
if (/* option 1 */ true) {
match_char('[');
parse_ws_optional();
parse_register();
parse_ws_optional();
match_char('+');
parse_register();
parse_ws_optional();
match_char('*');
parse_ws_optional();
parse_literal_decl();
parse_ws_optional();
match_char('+');
parse_ws_optional();
parse_literal_decl();
parse_ws_optional();
match_char(']');
}
}
void parse_addr_modes() {
if (/* option 1 */ true) {
parse_addrm_ind();
} else if (/* option 2 */ true) {
parse_addrm_ptr();
} else if (/* option 3 */ true) {
parse_addrm_idx();
} else if (/* option 4 */ true) {
parse_addrm_sca();
} else if (/* option 5 */ true) {
parse_addrm_dis();
}
}
void parse_operand() {
if (/* option 1 */ true) {
parse_register();
} else if (/* option 2 */ true) {
parse_identifier();
} else if (/* option 3 */ true) {
parse_literal_decl();
} else if (/* option 4 */ true) {
parse_addr_modes();
}
}
void parse_opcode() {
if (/* option 1 */ true) {
parse_letter();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_alpha_num_char();
}
}
}
}
void parse_operand_list() {
if (/* option 1 */ true) {
parse_operand();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
match_char(',');
parse_ws_optional();
parse_operand();
}
}
}
}
void parse_instruction() {
if (/* option 1 */ true) {
parse_opcode();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_whitespace();
parse_operand_list();
}
}
}
}
void parse_include_decl() {
if (/* option 1 */ true) {
match_string("include");
parse_whitespace();
parse_string_lit();
}
}
void parse_annotation_oper() {
if (/* option 1 */ true) {
parse_identifier();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_ws_optional();
match_char('=');
parse_ws_optional();
parse_literal_decl();
}
}
}
}
void parse_annotation_ops() {
if (/* option 1 */ true) {
parse_annotation_oper();
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_ws_optional();
match_char(',');
parse_ws_optional();
parse_annotation_oper();
}
}
}
}
void parse_annotation_args() {
if (/* option 1 */ true) {
match_char('(');
parse_ws_optional();
parse_annotation_ops();
parse_ws_optional();
match_char(')');
}
}
void parse_annotation() {
if (/* option 1 */ true) {
match_char('@');
parse_identifier();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_annotation_args();
}
}
}
}
void parse_section_decl() {
if (/* option 1 */ true) {
match_string("section");
parse_whitespace();
match_char('.');
parse_identifier();
}
}
void parse_label() {
if (/* option 1 */ true) {
parse_identifier();
match_char(':');
}
}
void parse_line_content() {
if (/* option 1 */ true) {
parse_include_decl();
} else if (/* option 2 */ true) {
parse_section_decl();
} else if (/* option 3 */ true) {
if (/* option 1 */ true) {
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_annotation();
parse_whitespace();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_label();
parse_ws_optional();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_instruction();
}
}
}
}
}
void parse_line() {
if (/* option 1 */ true) {
parse_ws_optional();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_line_content();
}
}
parse_ws_optional();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_comment();
}
}
parse_newline();
}
}
void parse_line_last() {
if (/* option 1 */ true) {
parse_ws_optional();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_line_content();
}
}
parse_ws_optional();
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_comment();
}
}
}
}
void parse_program() {
if (/* option 1 */ true) {
// Repeat block
while (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_line();
}
}
// Optional block
if (/* lookahead check */ true) {
if (/* option 1 */ true) {
parse_line_last();
}
}
}
}
};
View File
+10
View File
@@ -0,0 +1,10 @@
#pragma once
#include <spider/compiler/common.hpp>
namespace spider {
    // Forward declarations only; the full definitions are not part of this header.
    class Token;
    class RootToken;
}
@@ -0,0 +1,35 @@
#include "Assembler.hpp"
namespace spider {

    Assembler::Assembler() {}

    Assembler::~Assembler() {}

    /**
     * Loads a source file as a new level and parses it.
     *
     * The path is resolved to its canonical form so that the same file reached
     * through different spellings is recognised as one file.
     *
     * @param path  File to load.
     * @return Error::SUCCESS once the level has been parsed;
     *         Error::FILE_NOT_FOUND when the path cannot be resolved
     *         (fs::canonical reports an error, e.g. the file does not exist);
     *         Error::FILE_RECURSIVE_LOAD when the canonical path is already
     *         in `fstack`, i.e. the file is currently being loaded.
     * @throws std::runtime_error from FileTextReader when the file cannot be
     *         opened, plus anything parseCurrentLevel() throws. The path is
     *         removed from `fstack` on those paths as well.
     *
     * The Level pushed onto `levels` is left there after parsing.
     */
    Assembler::Error Assembler::loadFile(const fs::path& path) {
        // Use the non-throwing overload: the throwing one raises
        // filesystem_error for a missing file, which would make the
        // FILE_NOT_FOUND result unreachable.
        std::error_code ec;
        const fs::path abs_path = fs::canonical(path, ec);
        if (ec) return Error::FILE_NOT_FOUND;

        // A failed insertion means the path is already being loaded.
        const auto [entry, inserted] = fstack.insert(abs_path);
        if (!inserted) return Error::FILE_RECURSIVE_LOAD;

        // Drops the path from the in-progress set on every exit, including
        // exceptions. std::set iterators stay valid across nested inserts.
        struct InProgressGuard {
            set<fs::path>& paths;
            set<fs::path>::iterator entry;
            ~InProgressGuard() { paths.erase(entry); }
        };
        const InProgressGuard guard{fstack, entry};

        // Build the concrete reader; the unique_ptr converts to
        // uptr<TextReader>. TextReader itself is abstract and cannot be
        // constructed directly.
        levels.emplace_back(Level {
            .reader = std::make_unique<FileTextReader>(abs_path.string()),
            .source = abs_path.string(),
        });

        parseCurrentLevel();
        return Error::SUCCESS;
    }

    /**
     * Parses the most recently pushed level.
     * Not implemented yet: it only fetches the level.
     */
    void Assembler::parseCurrentLevel() {
        auto& lvl = levels.back();
    }

}
@@ -0,0 +1,52 @@
#pragma once
#include <spider/compiler/common.hpp>
#include <spider/compiler/text/TextReader.hpp>
#include <spider/compiler/tokens/RootToken.hpp>
namespace spider {

    /**
     * The spider assembler, capable of converting text into bytecode.
     */
    class Assembler {
    public:
        /** Result codes returned by loadFile(). */
        enum class Error {
            SUCCESS,
            FILE_NOT_FOUND,
            FILE_RECURSIVE_LOAD,
        };

        /** One loaded source: its reader, its root token and its name. */
        struct Level {
            uptr<TextReader> reader;
            RootToken root;
            std::string source;
        };

        /** Paths of the files currently being loaded (recursion check). */
        set<fs::path> fstack;

        /** Levels pushed by loadFile(), in load order. */
        deque<Level> levels;

        Assembler();
        ~Assembler();

        /**
         * Attempts to load a file, fails if it doesn't exist.
         */
        Error loadFile(const fs::path& path);

    private:
        void parseCurrentLevel();
    };

}
@@ -0,0 +1,11 @@
#pragma once
namespace spider {
/**
 * A disassembler, capable of converting bytecode into
 * readable text.
 *
 * Currently an empty placeholder: the class declares no members yet.
 */
class Disassembler {};
}
+55
View File
@@ -0,0 +1,55 @@
#pragma once
#include <cstdint>
#include <vector>
#include <deque>
#include <map>
#include <optional>
#include <string>
#include <memory>
#include <filesystem>
#include <set>
namespace spider {

    // ---- Fixed-width integer aliases ----
    using i8  = std::int8_t;
    using i16 = std::int16_t;
    using i32 = std::int32_t;
    using i64 = std::int64_t;

    using u8  = std::uint8_t;
    using u16 = std::uint16_t;
    using u32 = std::uint32_t;
    using u64 = std::uint64_t;

    // ---- Floating-point aliases ----
    // TODO: SPIDER_EMULATE_FLOAT will control f32
    // TODO: Check if we're on C++23, there is already stdfloat
    using f32 = float;
    using f64 = double;

    // The aliases are only usable if the host types have the expected widths.
    static_assert(sizeof(f32) == 4, "The f32 type must be exactly 4 bytes.");
    static_assert(sizeof(f64) == 8, "The f64 type must be exactly 8 bytes.");

    // ---- Size type ----
    // Note: this aliases std::size_t, which is unsigned.
    using isize = std::size_t;

    // ---- Standard-library names made available inside spider:: ----
    using std::deque;
    using std::map;
    using std::optional;
    using std::set;
    using std::vector;

    template<typename T> using uptr = std::unique_ptr<T>;
    template<typename T> using ptr  = std::shared_ptr<T>;

    namespace fs = std::filesystem;

    /**
     * A line/column pair. Both components default to 1.
     */
    struct pos {
        isize line;
        isize col;

        pos(isize line = 1, isize col = 1) : line{line}, col{col} {}
    };

}
+104
View File
@@ -0,0 +1,104 @@
#include "TextReader.hpp"
#include <spider/compiler/text/utf8.hpp>
#include <stdexcept>
namespace spider {
// Text Reader //
int TextReader::nextByte() {
int ch = getStream().get();
if (ch == std::istream::traits_type::eof()) {
return -1;
}
return ch;
}
bool TextReader::nextChar(u32& ch) {
int n = nextByte();
if(n == -1) return false;
isize len = utf8::seqlen(u8(n));
if(len == 0) return false;
isize i = 1;
char arr[4];
arr[0] = char(n);
while(i < len) {
n = nextByte();
if(n == -1) return false;
arr[i++] = char(n);
}
ch = utf8::decodeArr(arr, len);
advance(ch);
return true;
}
void TextReader::advance(u32 ch) {
if (ch == u32('\n')) {
if (lastWasCR) {
lastWasCR = false; // Mixed CRLF handling
} else {
at.line++;
at.col = 1;
}
} else if (ch == u32('\r')) {
at.line++;
at.col = 1;
lastWasCR = true;
} else {
at.col++;
lastWasCR = false;
}
}
bool TextReader::isEOF() {
return getStream().peek() == std::istream::traits_type::eof();
}
pos TextReader::getPosition() const {
return at;
}
// File Reader //
/**
 * Opens `filename` in binary mode for reading.
 *
 * @throws std::runtime_error when the file could not be opened.
 */
FileTextReader::FileTextReader(const std::string& filename)
    : fileStream(filename, std::ios::binary) {
    const bool opened = fileStream.is_open();
    if (opened) {
        return;
    }
    throw std::runtime_error("Failed to open file: " + filename);
}
/**
 * Exposes the owned file stream as the byte source for TextReader.
 */
std::istream& FileTextReader::getStream() {
return fileStream;
}
// String Reader //
/**
 * Creates a reader over `initialText`.
 * The text is moved into `buffer`, and a fresh input stream is then
 * built from that buffer.
 */
StringTextReader::StringTextReader(std::string initialText)
    : buffer(std::move(initialText)) {
    // Built in the body so the stream is always created after `buffer`
    // holds the text.
    stringStream = std::make_unique<std::istringstream>(buffer);
}
/**
 * Exposes the current string stream as the byte source for TextReader.
 * The stream object is replaced by set() and append().
 */
std::istream& StringTextReader::getStream() {
return *stringStream;
}
void StringTextReader::set(const std::string& newText) {
buffer = newText;
stringStream = std::make_unique<std::istringstream>(buffer);
lastWasCR = false;
}
/**
 * Appends `extraText` to the buffer and keeps reading from where the
 * reader currently stands.
 *
 * The stream is rebuilt over the enlarged buffer and repositioned to
 * the old read offset. The tracked line/column position is unchanged.
 */
void StringTextReader::append(const std::string& extraText) {
    // After the reader has hit end-of-input the old stream carries
    // eof/fail bits, and tellg() would then return -1. Seeking the new
    // stream to -1 would leave it failed, making the appended text
    // unreadable. Clear the state first so the real offset is reported.
    stringStream->clear();
    std::streampos consumed = stringStream->tellg();
    if (consumed == std::streampos(-1)) {
        // Defensive fallback: treat the whole old buffer as consumed.
        consumed = std::streampos(static_cast<std::streamoff>(buffer.size()));
    }

    buffer += extraText;
    stringStream = std::make_unique<std::istringstream>(buffer);
    stringStream->seekg(consumed);
}
}
+91
View File
@@ -0,0 +1,91 @@
#pragma once
#include <spider/compiler/common.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <memory>
namespace spider {
/**
 * Abstract Text Reader
 *
 * Reads characters from a std::istream provided by the subclass through
 * getStream(), while keeping track of the current line/column position.
 */
class TextReader {
public:
    TextReader() = default;
    virtual ~TextReader() = default;

    /** Reads the next character into `ch`; returns false if none was read. */
    bool nextChar(u32& ch);
    /** Tells whether the underlying stream is at its end. */
    bool isEOF();
    /** Returns the current line/column position. */
    pos getPosition() const;

protected:
    pos at;                  // current position
    bool lastWasCR = false;  // set while the previous character was '\r'

    /** Reads one raw byte; -1 at end of stream. */
    int nextByte();
    /** Updates `at` for the consumed character `ch`. */
    void advance(u32 ch);
    /** Byte source supplied by the concrete reader. */
    virtual std::istream& getStream() = 0;
};
/**
 * File Text Reader
 *
 * A TextReader whose byte source is a file stream owned by the object.
 */
class FileTextReader : public TextReader {
public:
    /** Opens `filename` for reading. */
    explicit FileTextReader(const std::string& filename);

protected:
    std::istream& getStream() override;

private:
    std::ifstream fileStream;
};
/**
 * String Text Reader
 *
 * A TextReader whose byte source is an in-memory string that can be
 * replaced (set) or extended (append) after construction.
 */
class StringTextReader : public TextReader {
public:
    explicit StringTextReader(std::string initialText = "");

    /** Replaces the whole text with `newText`. */
    void set(const std::string& newText);
    /** Adds `extraText` at the end of the current text. */
    void append(const std::string& extraText);

protected:
    std::istream& getStream() override;

private:
    // Keep `buffer` declared before `stringStream`: the stream is
    // constructed from the buffer's contents.
    std::string buffer;
    std::unique_ptr<std::istringstream> stringStream;
};
}
+91
View File
@@ -0,0 +1,91 @@
#pragma once
#include <spider/compiler/common.hpp>
#include <cstdint>
#include <cstddef>
#include <string>
namespace spider {
namespace utf8 {
// --------------------- //
// UTF-8 Sequence Length //
// --------------------- //
/**
 * Length in bytes of the UTF-8 sequence introduced by lead byte `c`,
 * judged from its high-bit pattern alone.
 *
 * @return 1 to 4 for a recognised lead pattern, 0 otherwise
 *         (e.g. for a continuation byte).
 */
constexpr isize seqlen(u8 c) {
    return (c & 0x80) == 0x00 ? 1   // 0xxxxxxx
         : (c & 0xE0) == 0xC0 ? 2   // 110xxxxx
         : (c & 0xF0) == 0xE0 ? 3   // 1110xxxx
         : (c & 0xF8) == 0xF0 ? 4   // 11110xxx
         : 0;
}
/**
 * True when `c` is a UTF-8 continuation byte, i.e. its two top bits
 * are `10`.
 */
constexpr bool isCont(u8 c) {
    return (c >> 6) == 0x02;
}
/**
 * Checks that `src` starts with a structurally well-formed UTF-8
 * sequence: a recognised lead byte followed by the required number of
 * continuation bytes, all within the first `len` bytes.
 *
 * @return the sequence length in bytes, or 0 when the check fails.
 */
constexpr isize isValidSeq(const char* src, isize len) {
    if (len == 0) return 0;

    const isize need = seqlen(u8(src[0]));
    if (need == 0 || need > len) return 0;

    // Walk over the continuation bytes; stop at the first bad one.
    isize idx = 1;
    while (idx < need && isCont(u8(src[idx]))) {
        ++idx;
    }
    return idx == need ? need : 0;
}
// ----------------- //
// UTF-8 into UTF-32 //
// ----------------- //
/**
 * Decodes the UTF-8 sequence at the start of `src` into a code point.
 *
 * Besides the structural check done by isValidSeq(), the decoded value
 * is rejected when it is an overlong encoding, a UTF-16 surrogate
 * (U+D800..U+DFFF) or above U+10FFFF.
 *
 * @param src bytes to decode.
 * @param len number of readable bytes in `src`.
 * @param out receives the code point; left untouched on failure.
 * @return number of bytes consumed, or 0 when the input is invalid.
 */
inline isize decode(const char* src, isize len, u32& out) {
    // check input is structurally valid
    const isize charlen = isValidSeq(src, len);
    if (charlen == 0) return 0;

    // payload mask of the lead byte, indexed by sequence length
    static constexpr u8 firstMask[5] = {
        0x00, // unused
        0x7F, // 0xxxxxxx
        0x1F, // 110xxxxx
        0x0F, // 1110xxxx
        0x07  // 11110xxx
    };
    // smallest code point that genuinely needs `charlen` bytes
    static constexpr u32 minCodePoint[5] = {
        0x0,     // unused
        0x0,     // 1 byte
        0x80,    // 2 bytes
        0x800,   // 3 bytes
        0x10000  // 4 bytes
    };

    // assemble the char in a temporary so `out` survives a rejection
    u32 cp = u8(src[0]) & firstMask[charlen];
    for (isize i = 1; i < charlen; ++i) {
        cp = (cp << 6) | (u8(src[i]) & 0x3F);
    }

    if (cp < minCodePoint[charlen]) return 0;    // overlong encoding
    if (cp > 0x10FFFF) return 0;                 // beyond Unicode range
    if (cp >= 0xD800 && cp <= 0xDFFF) return 0;  // surrogate half

    out = cp;
    return charlen;
}
/**
 * A simpler variant of decode() that performs no validation:
 * the caller must pass an already validated sequence of `chlen`
 * bytes (1 to 4).
 */
inline u32 decodeArr(const char* src, isize chlen) {
    // payload mask of the lead byte, indexed by sequence length
    static constexpr u8 leadMask[5] = {
        0x00, // unused
        0x7F, // 0xxxxxxx
        0x1F, // 110xxxxx
        0x0F, // 1110xxxx
        0x07  // 11110xxx
    };

    // start with the lead byte payload, then fold in 6 bits per byte
    u32 cp = u8(src[0]) & leadMask[chlen];
    isize i = 1;
    while (i < chlen) {
        cp = (cp << 6) | (u8(src[i]) & 0x3F);
        ++i;
    }
    return cp;
}
}
}
+25
View File
@@ -0,0 +1,25 @@
#pragma once
#include <spider/compiler/common.hpp>
namespace spider {
/**
 * Defines the root of a token.
 *
 * Only declarations live here: a default constructor, a destructor
 * and token(). Their definitions are not part of this header.
 */
class RootToken {
public:
    RootToken();
    ~RootToken();

    void token();
};
}
+26
View File
@@ -0,0 +1,26 @@
#include "Token.hpp"
namespace spider {
/**
 * Builds a token with no inner tokens.
 *
 * @param _at   position of the token.
 * @param _type kind of the token.
 * @param _str  text of the token; taken by value and moved into place
 *              so the string is not copied a second time.
 */
Token::Token(pos _at, TokenType _type, std::string _str)
    : at(_at), type(_type), str(std::move(_str)) {}
/**
 * Copy constructor: duplicates position, type, text and all inner
 * tokens of `other`.
 */
Token::Token(const Token& other)
    : at(other.at),
      type(other.type),
      str(other.str),
      inner(other.inner) {}
/**
 * Move constructor.
 * `at`, `type` and `str` are const members and are therefore copied;
 * only the inner token list is actually moved out of `other`.
 */
Token::Token(Token&& other)
    : at(other.at),
      type(other.type),
      str(other.str),
      inner(std::move(other.inner)) {}
/**
 * Adds a copy of `tok` at the end of the inner token list.
 */
void Token::append(const Token& tok) {
    inner.emplace_back(tok);
}
/**
 * Returns a copy of the inner token list; changes made to the
 * returned vector do not affect this token.
 */
vector<Token> Token::getInner() {
    vector<Token> snapshot = inner;
    return snapshot;
}
/**
 * Number of tokens directly held in the inner list.
 */
isize Token::innerCount() {
    const isize count = inner.size();
    return count;
}
}
+56
View File
@@ -0,0 +1,56 @@
#pragma once
#include <spider/compiler/common.hpp>
namespace spider {
/**
 * Token type.
 *
 * Enumerator order is significant for the underlying values;
 * add new entries with care.
 */
enum class TokenType {
    // Assembly
    PREPROCESSOR_TAG,
    WHITESPACE,
    NEWLINE,
    INSTRUCTION,
    OPCODE,
    OPERATOR,
    OPERAND,
    REGISTER,
    NUMBER,
    BIN_NUMBER,
    OCT_NUMBER,
    HEX_NUMBER,
    ADDR_NUMBER,
    BRACKET,
    BRACKET_IND,
    BRACKET_PTR,
    BRACKET_IDX,
    COMMA,
    COMMENT,
    SECTION,
    VARIABLE,
    ASSIGNMENT
    // Classic
    // Script
};
/**
 * Defines a general token.
 *
 * A token carries an immutable position, type and text, plus a
 * private list of inner tokens that can be appended to.
 */
class Token {
public:
    const pos at;           // where the token was found
    const TokenType type;   // kind of token
    const std::string str;  // text of the token

    Token(pos _at, TokenType _type, std::string _str);
    Token(const Token& tok);
    Token(Token&& tok);

    /** Appends a copy of `tok` to the inner tokens. */
    void append(const Token& tok);
    /** Returns the inner tokens by value. */
    vector<Token> getInner();
    /** Returns how many inner tokens are held. */
    isize innerCount();

private:
    vector<Token> inner;
};
}