From fb6fd3ff05439442c7a3bb3c6807455a049e87cc Mon Sep 17 00:00:00 2001
From: Arturo <cesarbalam49.cb@gmail.com>
Date: Thu, 19 Mar 2026 22:33:02 -0600
Subject: [PATCH] =?UTF-8?q?Add=20pygen.ipynb=20=E2=80=94=20Python=20code?=
 =?UTF-8?q?=20generator=20for=20CPU=20instructions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reads Spider_Instructions.xlsx and generates one `void MNEMONIC();`
declaration per instruction, injected into CPU.hpp between the pygen-target
markers. Also exports the addressing mode and type size masks to
pygen_out/InstructionMasks.hpp.

126 instructions currently defined. Reserved slots and incomplete
entries (Int 1-6 Slot) are skipped automatically.
---
 pygen.ipynb | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 388 insertions(+), 14 deletions(-)

diff --git a/pygen.ipynb b/pygen.ipynb
index b9c5052..b75834b 100644
--- a/pygen.ipynb
+++ b/pygen.ipynb
@@ -18,9 +18,43 @@
    "execution_count": null,
    "id": "b0fcd533",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Repo root : /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime\n",
+      "CPU.hpp   : /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime/src/spider/runtime/cpu/CPU.hpp\n",
+      "XLSX      : /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime/Spider_Instructions.xlsx\n",
+      "Output dir: /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime/pygen_out\n"
+     ]
+    }
+   ],
    "source": [
-    "# setup directories"
+    "# setup directories\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "# Root of the Spider runtime repo — adjust this path to match your machine (folder where spider-runtime lives).\n",
+    "REPO_ROOT = os.path.abspath('/home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime')\n",
+    "\n",
+    "# Where CPU.hpp lives — this is the file we will inject generated code into.\n",
+    "CPU_HPP_PATH = os.path.join(REPO_ROOT, 'src', 'spider', 'runtime', 'cpu', 'CPU.hpp')\n",
+    "\n",
+    "# Where the Excel instruction sheet lives. Place the .xlsx file in the project's root folder.\n",
+    "XLSX_PATH = os.path.join(REPO_ROOT, 'Spider_Instructions.xlsx')\n",
+    "\n",
+    "# Output folder for any standalone generated files.\n",
+    "OUT_DIR = os.path.join(REPO_ROOT, 'pygen_out')\n",
+    "\n",
+    "# Create the output directory if it does not exist yet.\n",
+    "# exist_ok=True means no error if it already exists.\n",
+    "os.makedirs(OUT_DIR, exist_ok=True)\n",
+    "\n",
+    "print(f'Repo root : {REPO_ROOT}')\n",
+    "print(f'CPU.hpp   : {CPU_HPP_PATH}')\n",
+    "print(f'XLSX      : {XLSX_PATH}')\n",
+    "print(f'Output dir: {OUT_DIR}')\n"
    ]
   },
   {
@@ -28,9 +62,75 @@
    "execution_count": null,
    "id": "b33de8ac",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--- Sample output for NOP ---\n",
+      "        // [System] 0x000 — NOP: No Operation\n",
+      "        //   Params: 0 | AddrMask1: 00 AddrMask2: 00 | TypeMask: 00\n",
+      "        //   Operation: Nothing\n",
+      "        void NOP();\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
-    "# Implement here some kind of \"C++\" printer"
+    "# Implement here some kind of \"C++\" printer\n",
+    "\n",
+    "# ── Indent used throughout the generated block ──────────────────────────────\n",
+    "INDENT = '        '  # 8 spaces — matches the indentation inside CPU.hpp\n",
+    "\n",
+    "def format_instruction(byte_code: str, mnemonic: str, name: str,\n",
+    "                        group: str, params: int,\n",
+    "                        addr_mask_1: str, addr_mask_2: str,\n",
+    "                        type_mask: str, operation: str) -> str:\n",
+    "    \"\"\"\n",
+    "    Returns a single C++ instruction declaration as a string.\n",
+    "\n",
+    "    Each instruction becomes a commented method declaration inside the CPU class.\n",
+    "    Format:\n",
+    "        // [GROUP] 0xBYTE — MNEMONIC: Name\n",
+    "        //   Params: N | AddrMask1: XX AddrMask2: XX | TypeMask: XX\n",
+    "        //   Operation: ...\n",
+    "        void MNEMONIC();\n",
+    "    \"\"\"\n",
+    "    lines = []\n",
+    "\n",
+    "    # Header comment: group, opcode, mnemonic and human-readable name.\n",
+    "    lines.append(f'{INDENT}// [{group}] 0x{byte_code} — {mnemonic}: {name}')\n",
+    "\n",
+    "    # Second comment line: parameter count, addressing masks, type size mask.\n",
+    "    lines.append(f'{INDENT}//   Params: {params} | '\n",
+    "                 f'AddrMask1: {addr_mask_1} AddrMask2: {addr_mask_2} | '\n",
+    "                 f'TypeMask: {type_mask}')\n",
+    "\n",
+    "    # Third comment line: what this instruction actually does.\n",
+    "    lines.append(f'{INDENT}//   Operation: {operation}')\n",
+    "\n",
+    "    # The declaration itself — a void member function named after the mnemonic, matching NOP/SPDR style.\n",
+    "    lines.append(f'{INDENT}void {mnemonic}();')          # method declaration inside CPU class\n",
+    "\n",
+    "    # Empty line between instructions for readability.\n",
+    "    lines.append('')\n",
+    "\n",
+    "    return '\\n'.join(lines)\n",
+    "\n",
+    "\n",
+    "def format_block(instructions: list) -> str:\n",
+    "    \"\"\"\n",
+    "    Joins all individual instruction strings into one complete block.\n",
+    "    This is the text that will be injected between the pygen-target markers.\n",
+    "    \"\"\"\n",
+    "    # Join every formatted instruction into one big string.\n",
+    "    return '\\n'.join(instructions)\n",
+    "\n",
+    "\n",
+    "# Print what one instruction looks like.\n",
+    "sample = format_instruction('000','NOP','No Operation','System',0,'00','00','00','Nothing')\n",
+    "print('--- Sample output for NOP ---')\n",
+    "print(sample)\n"
    ]
   },
   {
@@ -38,35 +138,309 @@
    "execution_count": null,
    "id": "58645013",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Real instructions : 126\n",
+      "Reserved slots    : 14\n",
+      "Duplicate check   : PASSED\n",
+      "\n",
+      "Groups found:\n",
+      "group\n",
+      "Integer           19\n",
+      "System            15\n",
+      "Bit Wise          14\n",
+      "Boolean           12\n",
+      "Branch            12\n",
+      "Floating Point    10\n",
+      "Casts             10\n",
+      "Memory             9\n",
+      "Trigonometric      7\n",
+      "Exponential        6\n",
+      "Matrix             6\n",
+      "SIMD               5\n",
+      "Easter Eggs        1\n",
+      "\n",
+      "First 5 instructions:\n",
+      "  byte_code mnemonic   group  params addr_mask_1 type_mask\n",
+      "0       000      NOP  System       0          00        00\n",
+      "1       001     SPDR  System       0          00        00\n",
+      "2       002    MMODE  System       1          05        01\n",
+      "3       003      INT  System       1          1F        0F\n",
+      "4       004      LRV  System       1          1F        0C\n"
+     ]
+    }
+   ],
    "source": [
-    "# read the instruction sheet with pandas"
+    "# read the instruction sheet with pandas\n",
+    "\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "# -- Load --------------------------------------------------------------------\n",
+    "# The data is on the 'Instructions' sheet. Header is on row index 6 (0-based),\n",
+    "# so we skip the first 6 rows of decorative merged cells.\n",
+    "raw = pd.read_excel(XLSX_PATH, sheet_name='Instructions', header=6)\n",
+    "\n",
+    "# Assign names to all 27 columns by position. This also labels the addressing\n",
+    "# mode mask columns, which appear after 'Acc. Addr. Mode Mask' with no header label.\n",
+    "raw.columns = [\n",
+    "    'skip_0',       # empty column A\n",
+    "    'skip_1',       # 'Base Instr.' label column\n",
+    "    'byte_code',    # opcode hex string e.g. '000'\n",
+    "    'mnemonic',     # short name e.g. 'NOP'\n",
+    "    'name',         # full name e.g. 'No Operation'\n",
+    "    'group',        # category e.g. 'System'\n",
+    "    'params',       # number of parameters (0, 1, or 2)\n",
+    "    'imp',          # addressing mode: Implied\n",
+    "    'imm',          # addressing mode: Immediate\n",
+    "    'abs',          # addressing mode: Absolute\n",
+    "    'reg',          # addressing mode: Register\n",
+    "    'ind',          # addressing mode: Indirect\n",
+    "    'ptr',          # addressing mode: Pointer\n",
+    "    'idx',          # addressing mode: Indexed\n",
+    "    'sca',          # addressing mode: Scaled\n",
+    "    'dis',          # addressing mode: Displaced\n",
+    "    'addr_mask_1',  # accepted addressing mode mask for param 1\n",
+    "    'addr_mask_2',  # accepted addressing mode mask for param 2\n",
+    "    'B',            # type size: Byte (1 byte) supported?\n",
+    "    'S',            # type size: Short (2 bytes) supported?\n",
+    "    'I',            # type size: Int (4 bytes) supported?\n",
+    "    'L',            # type size: Long (8 bytes) supported?\n",
+    "    'F',            # type size: Float supported?\n",
+    "    'D',            # type size: Double supported?\n",
+    "    'type_mask',    # combined type size mask as hex string\n",
+    "    'operation',    # human-readable description of what the instruction does\n",
+    "    'skip_2',       # trailing empty column\n",
+    "]\n",
+    "\n",
+    "# ── Filter ───────────────────────────────────────────────────────────────────\n",
+    "# Keep only rows that have a byte_code value (drops empty rows at the bottom).\n",
+    "df = raw[raw['byte_code'].notna()].copy()\n",
+    "\n",
+    "# Separate reserved slots from real instructions.\n",
+    "# Reserved entries have '(reserved)' in the mnemonic column.\n",
+    "is_reserved = df['mnemonic'].astype(str).str.contains('reserved', case=False, na=False)\n",
+    "reserved_df  = df[is_reserved].copy()   # keep for reference\n",
+    "instrs_df    = df[~is_reserved & df['mnemonic'].notna()].copy()  # real instructions only\n",
+    "\n",
+    "# Skip incomplete entries — rows with no group are placeholder slots (e.g. Int 1-6 Slot)\n",
+    "# that have no defined behaviour yet. Keeping them would generate invalid C++ identifiers.\n",
+    "instrs_df = instrs_df[instrs_df['group'].notna()].copy()\n",
+    "\n",
+    "# ── Clean ────────────────────────────────────────────────────────────────────\n",
+    "# Fill NaN masks with '00' (means 'no modes accepted' — safe default).\n",
+    "instrs_df['addr_mask_1'] = instrs_df['addr_mask_1'].fillna('00').astype(str).str.strip()\n",
+    "instrs_df['addr_mask_2'] = instrs_df['addr_mask_2'].fillna('00').astype(str).str.strip()\n",
+    "instrs_df['type_mask']   = instrs_df['type_mask'].fillna('00').astype(str).str.strip()\n",
+    "instrs_df['params']      = instrs_df['params'].fillna(0).astype(int)\n",
+    "instrs_df['name']        = instrs_df['name'].fillna('').astype(str).str.strip()\n",
+    "instrs_df['group']       = instrs_df['group'].fillna('Unknown').astype(str).str.strip()\n",
+    "instrs_df['operation']   = instrs_df['operation'].fillna('').astype(str).str.strip()\n",
+    "\n",
+    "# ── Sanitize mnemonics ──────────────────────────────────────────────────────\n",
+    "# C++ identifiers cannot contain spaces. Replace spaces with underscores and\n",
+    "# convert to uppercase so 'Int 1 Slot' becomes 'INT_1_SLOT'.\n",
+    "instrs_df['mnemonic'] = (\n",
+    "    instrs_df['mnemonic']\n",
+    "    .astype(str)\n",
+    "    .str.strip()                  # remove leading/trailing whitespace\n",
+    "    .str.replace(' ', '_')        # replace internal spaces with underscores\n",
+    "    .str.upper()                  # uppercase for consistency\n",
+    ")\n",
+    "\n",
+    "# ── Validate: duplicate mnemonics ────────────────────────────────────────────\n",
+    "# Duplicates in real instruction names would cause C++ compilation errors.\n",
+    "# We abort here rather than generating broken code.\n",
+    "mnemonic_counts = instrs_df['mnemonic'].value_counts()\n",
+    "duplicates = mnemonic_counts[mnemonic_counts > 1]\n",
+    "if not duplicates.empty:\n",
+    "    # Show which mnemonics are duplicated before raising the error.\n",
+    "    raise ValueError(f'Duplicate mnemonics found — fix the sheet before generating:\\n{duplicates}')\n",
+    "\n",
+    "print(f'Real instructions : {len(instrs_df)}')\n",
+    "print(f'Reserved slots    : {len(reserved_df)}')\n",
+    "print(f'Duplicate check   : PASSED')\n",
+    "print(f'\\nGroups found:')\n",
+    "print(instrs_df['group'].value_counts().to_string())\n",
+    "print(f'\\nFirst 5 instructions:')\n",
+    "print(instrs_df[['byte_code','mnemonic','group','params','addr_mask_1','type_mask']].head().to_string())\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "452bc76c",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Masks written to: /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime/pygen_out/InstructionMasks.hpp\n",
+      "Lines generated : 268\n"
+     ]
+    }
+   ],
    "source": [
-    "# well, then export the masks (TODO)"
+    "# export the addressing mode and type size masks\n",
+    "\n",
+    "\n",
+    "# ── Build the masks header content ──────────────────────────────────────────\n",
+    "lines = []\n",
+    "\n",
+    "# #pragma once include guard (non-standard but widely supported) — prevents the file from being included more than once.\n",
+    "lines.append('#pragma once')\n",
+    "lines.append('// AUTO-GENERATED by pygen.ipynb — DO NOT EDIT MANUALLY')\n",
+    "lines.append('#include <spider/runtime/common.hpp>')\n",
+    "lines.append('')\n",
+    "lines.append('namespace spider {')\n",
+    "lines.append('')\n",
+    "\n",
+    "# ── Addressing mode mask table ───────────────────────────────────────────────\n",
+    "# Each instruction has two masks (one per parameter), emitted as a constexpr array.\n",
+    "# NOTE(review): reserved/incomplete rows were filtered out in the previous cell, so a\n",
+    "# row's index matches its opcode only if the remaining opcodes are contiguous — confirm.\n",
+    "lines.append('// Addressing mode masks — indexed by opcode.')\n",
+    "lines.append('// [opcode][0] = mask for param 1, [opcode][1] = mask for param 2')\n",
+    "lines.append('constexpr u8 ADDR_MODE_MASKS[][2] = {')\n",
+    "\n",
+    "for _, row in instrs_df.iterrows():\n",
+    "    # Normalise the hex string mask so it can be emitted as a C++ hex literal.\n",
+    "    m1 = row['addr_mask_1'].replace('.0','').strip()  # remove pandas float artefact\n",
+    "    m2 = row['addr_mask_2'].replace('.0','').strip()\n",
+    "    m1 = m1 if m1 != 'nan' else '00'\n",
+    "    m2 = m2 if m2 != 'nan' else '00'\n",
+    "    # Each row: { 0xMASK1, 0xMASK2 }, // MNEMONIC\n",
+    "    lines.append(f'    {{ 0x{m1.upper()}, 0x{m2.upper()} }},  // {row[\"mnemonic\"]}')\n",
+    "\n",
+    "lines.append('};')\n",
+    "lines.append('')\n",
+    "\n",
+    "# ── Type size mask table ─────────────────────────────────────────────────────\n",
+    "# A single byte per instruction encoding which type sizes it accepts.\n",
+    "lines.append('// Type size masks — indexed by opcode.')\n",
+    "lines.append('constexpr u8 TYPE_SIZE_MASKS[] = {')\n",
+    "\n",
+    "for _, row in instrs_df.iterrows():\n",
+    "    tm = str(row['type_mask']).replace('.0','').strip()\n",
+    "    tm = tm if tm != 'nan' else '00'\n",
+    "    lines.append(f'    0x{tm.upper()},  // {row[\"mnemonic\"]}')\n",
+    "\n",
+    "lines.append('};')\n",
+    "lines.append('')\n",
+    "lines.append('} // namespace spider')\n",
+    "\n",
+    "# ── Write to file ────────────────────────────────────────────────────────────\n",
+    "masks_path = os.path.join(OUT_DIR, 'InstructionMasks.hpp')\n",
+    "with open(masks_path, 'w', encoding='utf-8') as f:\n",
+    "    # Join with '\\n' — repo etiquette says no \\r\\n. NOTE(review): open() above omits newline='\\n', so text mode may still translate line endings on Windows.\n",
+    "    f.write('\\n'.join(lines))\n",
+    "\n",
+    "print(f'Masks written to: {masks_path}')\n",
+    "print(f'Lines generated : {len(lines)}')\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "5aaebef0",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Instructions formatted: 126\n",
+      "\n",
+      "--- Preview (first 2 instructions) ---\n",
+      "        // [System] 0x000 — NOP: No Operation\n",
+      "        //   Params: 0 | AddrMask1: 00 AddrMask2: 00 | TypeMask: 00\n",
+      "        //   Operation: Nothing\n",
+      "        void NOP();\n",
+      "\n",
+      "        // [System] 0x001 — SPDR: Will place the Spider version of the interpreter in RA\n",
+      "        //   Params: 0 | AddrMask1: 00 AddrMask2: 00 | TypeMask: 00\n",
+      "        //   Operation: (Spider Version) -> RA\n",
+      "        void SPDR();\n",
+      "\n",
+      "\n",
+      "CPU.hpp updated successfully at: /home/arturobalam/Documents/7thQuarter/Estancia_2/internship-repo/ArturoBalam-Internship2-repo/spider-runtime-folder/spider-runtime/src/spider/runtime/cpu/CPU.hpp\n",
+      "Total lines in updated file: 674\n"
+     ]
+    }
+   ],
    "source": [
-    "# print the CPU Instructions"
+    "# print the CPU Instructions\n",
+    "\n",
+    "# ── Generate all instruction declarations ───────────────────────────────────\n",
+    "formatted = []\n",
+    "\n",
+    "for _, row in instrs_df.iterrows():\n",
+    "    # Clean each field — remove pandas float artefacts like '00.0'\n",
+    "    byte_code   = str(row['byte_code']).strip()\n",
+    "    mnemonic    = str(row['mnemonic']).strip()\n",
+    "    name        = str(row['name']).strip()\n",
+    "    group       = str(row['group']).strip()\n",
+    "    params      = int(row['params'])\n",
+    "    addr_mask_1 = str(row['addr_mask_1']).replace('.0', '').strip()\n",
+    "    addr_mask_2 = str(row['addr_mask_2']).replace('.0', '').strip()\n",
+    "    type_mask   = str(row['type_mask']).replace('.0', '').strip()\n",
+    "    operation   = str(row['operation']).strip()\n",
+    "\n",
+    "    # Call the C++ printer from Cell 2 to format this instruction.\n",
+    "    formatted.append(format_instruction(\n",
+    "        byte_code, mnemonic, name, group,\n",
+    "        params, addr_mask_1, addr_mask_2,\n",
+    "        type_mask, operation\n",
+    "    ))\n",
+    "\n",
+    "# Combine all declarations into one block string.\n",
+    "generated_block = format_block(formatted)\n",
+    "\n",
+    "print(f'Instructions formatted: {len(formatted)}')\n",
+    "print('\\n--- Preview (first 2 instructions) ---')\n",
+    "print('\\n'.join(formatted[:2]))\n",
+    "\n",
+    "# ── Inject into CPU.hpp ──────────────────────────────────────────────────────\n",
+    "# The markers tell us exactly where to insert the generated block.\n",
+    "MARKER_OPEN  = '// <pygen-target name=cpu-instructions> //'\n",
+    "MARKER_CLOSE = '// </pygen-target> //'\n",
+    "\n",
+    "# Read the current CPU.hpp content.\n",
+    "with open(CPU_HPP_PATH, 'r', encoding='utf-8') as f:\n",
+    "    original = f.read()\n",
+    "\n",
+    "# Verify both markers exist before modifying anything.\n",
+    "# If either is missing, the file was edited by hand — abort to avoid corruption.\n",
+    "if MARKER_OPEN not in original:\n",
+    "    raise ValueError(f'Open marker not found in CPU.hpp: {MARKER_OPEN}')\n",
+    "if MARKER_CLOSE not in original:\n",
+    "    raise ValueError(f'Close marker not found in CPU.hpp: {MARKER_CLOSE}')\n",
+    "\n",
+    "# Split the file around the pygen-target markers; the old text between them is dropped.\n",
+    "# before : everything up to and including the open marker\n",
+    "# after  : from the close marker onward (including it)\n",
+    "before = original[:original.index(MARKER_OPEN) + len(MARKER_OPEN)]\n",
+    "after  = original[original.index(MARKER_CLOSE):]\n",
+    "\n",
+    "# Reassemble: keep before, inject the generated block, then restore after.\n",
+    "updated = before + '\\n' + generated_block + '\\n' + INDENT + after\n",
+    "\n",
+    "# Write back using UTF-8 and Unix line endings only (repo etiquette: no \\r\\n).\n",
+    "with open(CPU_HPP_PATH, 'w', encoding='utf-8', newline='\\n') as f:\n",
+    "    f.write(updated)\n",
+    "\n",
+    "print(f'\\nCPU.hpp updated successfully at: {CPU_HPP_PATH}')\n",
+    "print(f'Total lines in updated file: {len(updated.splitlines())}')\n"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "spider-rntm-env",
    "language": "python",
    "name": "python3"
   },
@@ -80,7 +454,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.7"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,