From 44ca31b30978935283b540159e2bd4e073d6fa56 Mon Sep 17 00:00:00 2001
From: "kai [they]" <coilysiren@gmail.com>
Date: Sun, 22 Oct 2023 03:36:06 -0700
Subject: [PATCH] SQL `INSERT INTO` in python (#88)

---
 data/sql_input_2.sql        |  21 ++++
 data/sql_input_3.sql        |  21 ++++
 data/sql_output_0.json      |   4 +-
 data/sql_output_1.json      |   4 +-
 data/sql_output_2.json      |   4 +
 data/sql_output_3.json      |   4 +
 snippets/python/sql_test.py | 210 ++++++++++++++++++++++++++----------
 src/python/sql_test.py      | 198 +++++++++++++++++++++++++---------
 tasks.py                    |   2 +-
 9 files changed, 355 insertions(+), 113 deletions(-)
 create mode 100644 data/sql_input_2.sql
 create mode 100644 data/sql_input_3.sql
 create mode 100644 data/sql_output_2.json
 create mode 100644 data/sql_output_3.json

diff --git a/data/sql_input_2.sql b/data/sql_input_2.sql
new file mode 100644
index 0000000..89b50fb
--- /dev/null
+++ b/data/sql_input_2.sql
@@ -0,0 +1,21 @@
+-- https://cratedb.com/docs/sql-99/en/latest/chapters/01.html
+-- https://www.postgresql.org/docs/16/sql-createtable.html
+-- https://www.postgresql.org/docs/16/sql-insert.html
+-- https://www.postgresql.org/docs/16/sql-select.html
+CREATE TABLE city (
+    name VARCHAR,
+    population INT,
+    timezone INT
+);
+
+INSERT INTO city (name, population, timezone)
+VALUES ('San Francisco', 852469, -8);
+
+INSERT INTO city (name, population, timezone)
+VALUES ('New York', 8405837, -5);
+
+SELECT
+    name,
+    population,
+    timezone
+FROM city;
diff --git a/data/sql_input_3.sql b/data/sql_input_3.sql
new file mode 100644
index 0000000..df39515
--- /dev/null
+++ b/data/sql_input_3.sql
@@ -0,0 +1,21 @@
+-- https://cratedb.com/docs/sql-99/en/latest/chapters/01.html
+-- https://www.postgresql.org/docs/16/sql-createtable.html
+-- https://www.postgresql.org/docs/16/sql-insert.html
+-- https://www.postgresql.org/docs/16/sql-select.html
+CREATE TABLE city (
+    name VARCHAR,
+    population INT,
+    timezone INT
+);
+
+INSERT INTO city (name, timezone)
+VALUES ('San Francisco', -8);
+
+INSERT INTO city (name, population)
+VALUES ('New York', 8405837);
+
+SELECT
+    name,
+    population,
+    timezone
+FROM city;
diff --git a/data/sql_output_0.json b/data/sql_output_0.json
index eaa5d04..4775e53 100644
--- a/data/sql_output_0.json
+++ b/data/sql_output_0.json
@@ -1,3 +1 @@
-{
-  "table_name": ["city"]
-}
+[{ "table_name": "city" }]
diff --git a/data/sql_output_1.json b/data/sql_output_1.json
index 86970c5..223e140 100644
--- a/data/sql_output_1.json
+++ b/data/sql_output_1.json
@@ -1,3 +1 @@
-{
-  "table_name": ["city", "town"]
-}
+[{ "table_name": "city" }, { "table_name": "town" }]
diff --git a/data/sql_output_2.json b/data/sql_output_2.json
new file mode 100644
index 0000000..e00405c
--- /dev/null
+++ b/data/sql_output_2.json
@@ -0,0 +1,4 @@
+[
+  { "name": "San Francisco", "population": 852469, "timezone": -8 },
+  { "name": "New York", "population": 8405837, "timezone": -5 }
+]
diff --git a/data/sql_output_3.json b/data/sql_output_3.json
new file mode 100644
index 0000000..e1591f5
--- /dev/null
+++ b/data/sql_output_3.json
@@ -0,0 +1,4 @@
+[
+  { "name": "San Francisco", "population": null, "timezone": -8 },
+  { "name": "New York", "population": 8405837, "timezone": null }
+]
diff --git a/snippets/python/sql_test.py b/snippets/python/sql_test.py
index b9d290e..970bfa0 100644
--- a/snippets/python/sql_test.py
+++ b/snippets/python/sql_test.py
@@ -1,31 +1,119 @@
 
+import dataclasses
 import json
+import typing
+
+
+@dataclasses.dataclass(frozen=True)
+class SQLState:
+    state: dict
+
+    def read_table_meta(self, table_name: str) -> dict:
+        return self.state.get(table_name, {}).get("metadata", {})
+
+    def read_table_rows(self, table_name: str) -> list[dict]:
+        return self.state.get(table_name, {}).get("rows", [])
+
+    def read_information_schema(self) -> list[dict]:
+        return [data["metadata"] for data in self.state.values()]
+
+    def write_table_meta(self, table_name: str, data: dict):
+        state = self.state
+        table = state.get(table_name, {})
+        metadata = table.get("metadata", {})
+        metadata.update(data)
+        table["metadata"] = metadata
+        state[table_name] = table
+        return self.__class__(state)
+
+    def write_table_rows(self, table_name: str, data: dict):
+        state = self.state
+        table = state.get(table_name, {})
+        rows = table.get("rows", [])
+        rows.append(data)
+        table["rows"] = rows
+        state[table_name] = table
+        return self.__class__(state)
+
+
+class SQLType:
+    @staticmethod
+    def varchar(data) -> str:
+        data_str = str(data).strip()
+        if data_str.startswith("'") or data_str.startswith('"'):
+            data_str = data_str[1:]
+        if data_str.endswith("'") or data_str.endswith('"'):
+            data_str = data_str[:-1]
+        return data_str
+
+    @staticmethod
+    def int(data) -> int:
+        return int(data.strip())
+
+
+sql_type_map = {
+    "VARCHAR": SQLType.varchar,
+    "INT": SQLType.int,
+}
+
+
+class SQLFunctions:
+    @staticmethod
+    def create_table(state: SQLState, *args, table_schema="public") -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
+        table_name = args[2]
 
+        # get columns
+        columns = {}
+        columns_str = " ".join(args[3:]).replace("(", "").replace(")", "").strip()
+        if columns_str:
+            # fmt: off
+            columns = {
+                column.strip().split(" ")[0]: column.strip().split(" ")[1]
+                for column in columns_str.split(",")
+            }
+            # fmt: on
 
-class SQL:
-    data: dict = {}
-
-    def __init__(self) -> None:
-        self.data = {}
-
-    def information_schema_tables(self) -> list[dict]:
-        return [data["metadata"] for data in self.data.values()]
-
-    def create_table(self, *args, table_schema="public") -> dict:
-        table_name = args[2]
-        if not self.data.get(table_name):
-            self.data[table_name] = {
-                "metadata": {
+        if not state.read_table_meta(table_name):
+            state = state.write_table_meta(
+                table_name,
+                {
                     "table_name": table_name,
                     "table_schema": table_schema,
+                    "colums": columns,
                 },
-            }
-        return {}
+            )
+        return (output, state)
+
+    @staticmethod
+    def insert_into(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
+        table_name = args[2]
+
+        values_index = None
+        for i, arg in enumerate(args):
+            if arg == "VALUES":
+                values_index = i
+        if values_index is None:
+            raise ValueError("VALUES not found")
+
+        keys = " ".join(args[3:values_index]).replace("(", "").replace(")", "").split(",")
+        keys = [key.strip() for key in keys]
+        values = " ".join(args[values_index + 1 :]).replace("(", "").replace(")", "").split(",")
+        values = [value.strip() for value in values]
+        key_value_map = dict(zip(keys, values))
 
-    create_table.sql = "CREATE TABLE"
+        data = {}
+        if metadata := state.read_table_meta(table_name):
+            for key, value in key_value_map.items():
+                data[key] = sql_type_map[metadata["colums"][key]](value)
+            state = state.write_table_rows(table_name, data)
 
-    def select(self, *args) -> dict:
-        output = {}
+        return (output, state)
+
+    @staticmethod
+    def select(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
 
         from_index = None
         where_index = None
@@ -34,49 +122,59 @@ def select(self, *args) -> dict:
                 from_index = i
             if arg == "WHERE":
                 where_index = i
+        if from_index is None:
+            raise ValueError("FROM not found")
 
         # get select keys by getting the slice of args before FROM
         select_keys = " ".join(args[1:from_index]).split(",")
+        select_keys = [key.strip() for key in select_keys]
 
         # get where keys by getting the slice of args after WHERE
         from_value = args[from_index + 1]
 
-        # consider "information_schema.tables" a special case until
-        # we figure out why its so different from the others
+        # `information_schema.tables` is a special case
         if from_value == "information_schema.tables":
-            target = self.information_schema_tables()
-
-        # fmt: off
-        output = {
-            key: [
-                value for data in target
-                for key, value in data.items()
-                if key in select_keys
-            ]
-            for key in select_keys
-        }
-        # fmt: on
-
-        return output
-
-    select.sql = "SELECT"
-
-    sql_map = {
-        create_table.sql: create_table,
-        select.sql: select,
-    }
-
-    def run(self, input_sql: list[str]) -> list[str]:
-        output = {}
-
-        for line in input_sql:
-            if not line.startswith("--"):
-                words = line.split(" ")
-                for i in reversed(range(len(words))):
-                    key = " ".join(words[:i])
-                    if func := self.sql_map.get(key):
-                        output = func(self, *words)
-                        break
-
-        return [json.dumps(output)]
+            data = state.read_information_schema()
+        else:
+            data = state.read_table_rows(from_value)
+
+        output = []
+        for datum in data:
+            # fmt: off
+            output.append({
+                key: datum.get(key)
+                for key in select_keys
+            })
+            # fmt: on
+
+        return (output, state)
+
+
+sql_function_map: dict[str, typing.Callable] = {
+    "CREATE TABLE": SQLFunctions.create_table,
+    "SELECT": SQLFunctions.select,
+    "INSERT INTO": SQLFunctions.insert_into,
+}
+
+
+def run_sql(input_sql: list[str]) -> list[str]:
+    output = []
+    state = SQLState(state={})
+
+    # remove comments
+    input_sql = [line.strip() for line in input_sql if not line.startswith("--")]
+
+    # re-split on semi-colons
+    input_sql = " ".join(input_sql).split(";")
+
+    # iterate over each line of sql
+    for line in input_sql:
+        words = line.split(" ")
+        for i in reversed(range(len(words) + 1)):
+            key = " ".join(words[:i]).strip()
+            if func := sql_function_map.get(key):
+                output, state = func(state, *[word for word in words if word])
+                break
+
+    return [json.dumps(output)]
 
diff --git a/src/python/sql_test.py b/src/python/sql_test.py
index 7f41e13..0e6df4f 100644
--- a/src/python/sql_test.py
+++ b/src/python/sql_test.py
@@ -9,33 +9,121 @@
 ########################
 
 
+import dataclasses
 import json
+import typing
+
+
+@dataclasses.dataclass(frozen=True)
+class SQLState:
+    state: dict
+
+    def read_table_meta(self, table_name: str) -> dict:
+        return self.state.get(table_name, {}).get("metadata", {})
+
+    def read_table_rows(self, table_name: str) -> list[dict]:
+        return self.state.get(table_name, {}).get("rows", [])
+
+    def read_information_schema(self) -> list[dict]:
+        return [data["metadata"] for data in self.state.values()]
+
+    def write_table_meta(self, table_name: str, data: dict):
+        state = self.state
+        table = state.get(table_name, {})
+        metadata = table.get("metadata", {})
+        metadata.update(data)
+        table["metadata"] = metadata
+        state[table_name] = table
+        return self.__class__(state)
+
+    def write_table_rows(self, table_name: str, data: dict):
+        state = self.state
+        table = state.get(table_name, {})
+        rows = table.get("rows", [])
+        rows.append(data)
+        table["rows"] = rows
+        state[table_name] = table
+        return self.__class__(state)
+
+
+class SQLType:
+    @staticmethod
+    def varchar(data) -> str:
+        data_str = str(data).strip()
+        if data_str.startswith("'") or data_str.startswith('"'):
+            data_str = data_str[1:]
+        if data_str.endswith("'") or data_str.endswith('"'):
+            data_str = data_str[:-1]
+        return data_str
+
+    @staticmethod
+    def int(data) -> int:
+        return int(data.strip())
+
+
+sql_type_map = {
+    "VARCHAR": SQLType.varchar,
+    "INT": SQLType.int,
+}
+
+
+class SQLFunctions:
+    @staticmethod
+    def create_table(state: SQLState, *args, table_schema="public") -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
+        table_name = args[2]
 
+        # get columns
+        columns = {}
+        columns_str = " ".join(args[3:]).replace("(", "").replace(")", "").strip()
+        if columns_str:
+            # fmt: off
+            columns = {
+                column.strip().split(" ")[0]: column.strip().split(" ")[1]
+                for column in columns_str.split(",")
+            }
+            # fmt: on
 
-class SQL:
-    data: dict = {}
-
-    def __init__(self) -> None:
-        self.data = {}
-
-    def information_schema_tables(self) -> list[dict]:
-        return [data["metadata"] for data in self.data.values()]
-
-    def create_table(self, *args, table_schema="public") -> dict:
-        table_name = args[2]
-        if not self.data.get(table_name):
-            self.data[table_name] = {
-                "metadata": {
+        if not state.read_table_meta(table_name):
+            state = state.write_table_meta(
+                table_name,
+                {
                     "table_name": table_name,
                     "table_schema": table_schema,
+                    "colums": columns,
                 },
-            }
-        return {}
+            )
+        return (output, state)
+
+    @staticmethod
+    def insert_into(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
+        table_name = args[2]
+
+        values_index = None
+        for i, arg in enumerate(args):
+            if arg == "VALUES":
+                values_index = i
+        if values_index is None:
+            raise ValueError("VALUES not found")
+
+        keys = " ".join(args[3:values_index]).replace("(", "").replace(")", "").split(",")
+        keys = [key.strip() for key in keys]
+        values = " ".join(args[values_index + 1 :]).replace("(", "").replace(")", "").split(",")
+        values = [value.strip() for value in values]
+        key_value_map = dict(zip(keys, values))
 
-    create_table.sql = "CREATE TABLE"
+        data = {}
+        if metadata := state.read_table_meta(table_name):
+            for key, value in key_value_map.items():
+                data[key] = sql_type_map[metadata["colums"][key]](value)
+            state = state.write_table_rows(table_name, data)
 
-    def select(self, *args) -> dict:
-        output = {}
+        return (output, state)
+
+    @staticmethod
+    def select(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
+        output: list[dict] = []
 
         from_index = None
         where_index = None
@@ -44,51 +132,61 @@ def select(self, *args) -> dict:
                 from_index = i
             if arg == "WHERE":
                 where_index = i
+        if from_index is None:
+            raise ValueError("FROM not found")
 
         # get select keys by getting the slice of args before FROM
         select_keys = " ".join(args[1:from_index]).split(",")
+        select_keys = [key.strip() for key in select_keys]
 
         # get where keys by getting the slice of args after WHERE
         from_value = args[from_index + 1]
 
-        # consider "information_schema.tables" a special case until
-        # we figure out why its so different from the others
+        # `information_schema.tables` is a special case
         if from_value == "information_schema.tables":
-            target = self.information_schema_tables()
+            data = state.read_information_schema()
+        else:
+            data = state.read_table_rows(from_value)
+
+        output = []
+        for datum in data:
+            # fmt: off
+            output.append({
+                key: datum.get(key)
+                for key in select_keys
+            })
+            # fmt: on
+
+        return (output, state)
+
 
-        # fmt: off
-        output = {
-            key: [
-                value for data in target
-                for key, value in data.items()
-                if key in select_keys
-            ]
-            for key in select_keys
-        }
-        # fmt: on
+sql_function_map: dict[str, typing.Callable] = {
+    "CREATE TABLE": SQLFunctions.create_table,
+    "SELECT": SQLFunctions.select,
+    "INSERT INTO": SQLFunctions.insert_into,
+}
 
-        return output
 
-    select.sql = "SELECT"
+def run_sql(input_sql: list[str]) -> list[str]:
+    output = []
+    state = SQLState(state={})
 
-    sql_map = {
-        create_table.sql: create_table,
-        select.sql: select,
-    }
+    # remove comments
+    input_sql = [line.strip() for line in input_sql if not line.startswith("--")]
 
-    def run(self, input_sql: list[str]) -> list[str]:
-        output = {}
+    # re-split on semi-colons
+    input_sql = " ".join(input_sql).split(";")
 
-        for line in input_sql:
-            if not line.startswith("--"):
-                words = line.split(" ")
-                for i in reversed(range(len(words))):
-                    key = " ".join(words[:i])
-                    if func := self.sql_map.get(key):
-                        output = func(self, *words)
-                        break
+    # iterate over each line of sql
+    for line in input_sql:
+        words = line.split(" ")
+        for i in reversed(range(len(words) + 1)):
+            key = " ".join(words[:i]).strip()
+            if func := sql_function_map.get(key):
+                output, state = func(state, *[word for word in words if word])
+                break
 
-        return [json.dumps(output)]
+    return [json.dumps(output)]
 
 
 ######################
@@ -96,4 +194,4 @@ def run(self, input_sql: list[str]) -> list[str]:
 ######################
 
 if __name__ == "__main__":
-    helpers.run(SQL().run)
+    helpers.run(run_sql)
diff --git a/tasks.py b/tasks.py
index f104be7..1012564 100644
--- a/tasks.py
+++ b/tasks.py
@@ -300,7 +300,7 @@ def run_tests(self, input_script):
                         prepared_file_data = json.load(reader)
                     with open(ctx.script_output_file_path, "r", encoding="utf-8") as reader:
                         script_output_file_data = json.load(reader)
-                    unittest.TestCase().assertDictEqual(prepared_file_data, script_output_file_data)
+                    unittest.TestCase().assertListEqual(prepared_file_data, script_output_file_data)
                     self.set_success_status(True)
                     print(f"\t🟢 {ctx.script_relative_path} on {ctx.input_file_path} succeeded")
                     continue