From 82b1f806c1b0cf8e076976398a86731a230437d5 Mon Sep 17 00:00:00 2001
From: Lev <lev.kokotov@gmail.com>
Date: Thu, 14 Apr 2022 13:12:00 -0700
Subject: [PATCH] lint

---
 pgml/pgml/model.py       | 151 +++++++++++++++++++++++++--------------
 pgml/pgml/sql.py         |   1 +
 pgml/tests/plpy.py       |   7 +-
 pgml/tests/test_model.py | 136 +++++++++++++++++++++++++++++++----
 4 files changed, 226 insertions(+), 69 deletions(-)

diff --git a/pgml/pgml/model.py b/pgml/pgml/model.py
index b34145aca..753df0141 100644
--- a/pgml/pgml/model.py
+++ b/pgml/pgml/model.py
@@ -9,10 +9,11 @@
 from pgml.exceptions import PgMLException
 from pgml.sql import q
 
+
 class Project(object):
     """
     Use projects to refine multiple models of a particular dataset on a specific objective.
-    
+
     Attributes:
         id (int): a unique identifier
         name (str): a human friendly unique identifier
@@ -20,7 +21,7 @@ class Project(object):
         created_at (Timestamp): when this project was created
         updated_at (Timestamp): when this project was last updated
     """
-    
+
     _cache = {}
 
     def __init__(self):
@@ -36,11 +37,14 @@ def find(cls, id: int):
         Returns:
             Project or None: instantiated from the database if found
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             SELECT * 
             FROM pgml.projects 
             WHERE id = {q(id)}
-        """, 1)
+        """,
+            1,
+        )
         if len(result) == 0:
             return None
 
@@ -53,11 +57,11 @@ def find(cls, id: int):
     @classmethod
     def find_by_name(cls, name: str):
         """
-        Get a Project from the database by name. 
-        
+        Get a Project from the database by name.
+
         This is the prefered API to retrieve projects, and they are cached by
         name to avoid needing to go to he database on every usage.
-        
+
         Args:
             name (str): the project name
         Returns:
@@ -65,13 +69,16 @@ def find_by_name(cls, name: str):
         """
         if name in cls._cache:
             return cls._cache[name]
-    
-        result = plpy.execute(f"""
+
+        result = plpy.execute(
+            f"""
             SELECT * 
             FROM pgml.projects 
             WHERE name = {q(name)}
-        """, 1)
-        if len(result)== 0:
+        """,
+            1,
+        )
+        if len(result) == 0:
             return None
 
         project = Project()
@@ -84,7 +91,7 @@ def find_by_name(cls, name: str):
     def create(cls, name: str, objective: str):
         """
         Create a Project and save it to the database.
-        
+
         Args:
             name (str): a human friendly identifier
             objective (str): valid values are ["regression", "classification"].
@@ -93,11 +100,16 @@ def create(cls, name: str, objective: str):
         """
 
         project = Project()
-        project.__dict__ = dict(plpy.execute(f"""
+        project.__dict__ = dict(
+            plpy.execute(
+                f"""
             INSERT INTO pgml.projects (name, objective) 
             VALUES ({q(name)}, {q(objective)}) 
             RETURNING *
-        """, 1)[0])
+        """,
+                1,
+            )[0]
+        )
         project.__init__()
         cls._cache[name] = project
         return project
@@ -112,10 +124,11 @@ def deployed_model(self):
             self._deployed_model = Model.find_deployed(self.id)
         return self._deployed_model
 
+
 class Snapshot(object):
     """
     Snapshots capture a set of training & test data for repeatability.
-    
+
     Attributes:
         id (int): a unique identifier
         relation_name (str): the name of the table or view to snapshot
@@ -126,11 +139,18 @@ class Snapshot(object):
         created_at (Timestamp): when this snapshot was created
         updated_at (Timestamp): when this snapshot was last updated
     """
+
     @classmethod
-    def create(cls, relation_name: str, y_column_name: str, test_size: float or int, test_sampling: str):
+    def create(
+        cls,
+        relation_name: str,
+        y_column_name: str,
+        test_size: float or int,
+        test_sampling: str,
+    ):
         """
-        Create a Snapshot and save it to the database. 
-        
+        Create a Snapshot and save it to the database.
+
         This creates both a metadata record in the snapshots table, as well as creating a new table
         that holds a snapshot of all the data currently present in the relation so that training
         runs may be repeated, or further analysis may be conducted against the input.
@@ -145,21 +165,33 @@ def create(cls, relation_name: str, y_column_name: str, test_size: float or int,
         """
 
         snapshot = Snapshot()
-        snapshot.__dict__ = dict(plpy.execute(f"""
+        snapshot.__dict__ = dict(
+            plpy.execute(
+                f"""
             INSERT INTO pgml.snapshots (relation_name, y_column_name, test_size, test_sampling, status)
             VALUES ({q(relation_name)}, {q(y_column_name)}, {q(test_size)}, {q(test_sampling)}, 'new')
             RETURNING *
-        """, 1)[0])
-        plpy.execute(f"""
+        """,
+                1,
+            )[0]
+        )
+        plpy.execute(
+            f"""
             CREATE TABLE pgml."snapshot_{snapshot.id}" AS 
             SELECT * FROM "{snapshot.relation_name}";
-        """)
-        snapshot.__dict__ = dict(plpy.execute(f"""
+        """
+        )
+        snapshot.__dict__ = dict(
+            plpy.execute(
+                f"""
             UPDATE pgml.snapshots 
             SET status = 'created' 
             WHERE id = {q(snapshot.id)} 
             RETURNING *
-        """, 1)[0])
+        """,
+                1,
+            )[0]
+        )
         return snapshot
 
     def data(self):
@@ -167,10 +199,12 @@ def data(self):
         Returns:
             list, list, list, list: All rows from the snapshot split into X_train, X_test, y_train, y_test sets.
         """
-        data = plpy.execute(f"""
+        data = plpy.execute(
+            f"""
             SELECT * 
             FROM pgml."snapshot_{self.id}"
-        """)
+        """
+        )
 
         print(data)
         # Sanity check the data
@@ -203,10 +237,10 @@ def data(self):
             y.append(y_)
 
         # Split into training and test sets
-        if self.test_sampling == 'random':
+        if self.test_sampling == "random":
             return train_test_split(X, y, test_size=self.test_size, random_state=0)
         else:
-            if self.test_sampling == 'first':
+            if self.test_sampling == "first":
                 X.reverse()
                 y.reverse()
                 if isinstance(split, float):
@@ -216,9 +250,9 @@ def data(self):
                 split = int(self.test_size * X.len())
             return X[:split], X[split:], y[:split], y[split:]
 
-
         # TODO normalize and clean data
 
+
 class Model(object):
     """Models use an algorithm on a snapshot of data to record the parameters learned.
 
@@ -234,23 +268,26 @@ class Model(object):
         pickle (bytes): the serialized version of the model parameters
         algorithm: the in memory version of the model parameters that can make predictions
     """
+
     @classmethod
     def create(cls, project: Project, snapshot: Snapshot, algorithm_name: str):
         """
         Create a Model and save it to the database.
-        
+
         Args:
-            project (str): 
-            snapshot (str): 
+            project (Project): the project this model is created for
+            snapshot (Snapshot): the snapshot this model is associated with
             algorithm_name (str):
         Returns:
             Model: instantiated from the database
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             INSERT INTO pgml.models (project_id, snapshot_id, algorithm_name, status) 
             VALUES ({q(project.id)}, {q(snapshot.id)}, {q(algorithm_name)}, 'new') 
             RETURNING *
-        """)
+        """
+        )
         model = Model()
         model.__dict__ = dict(result[0])
         model.__init__()
@@ -265,7 +302,8 @@ def find_deployed(cls, project_id: int):
         Returns:
             Model: that should currently be used for predictions of the project
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             SELECT models.* 
             FROM pgml.models 
             JOIN pgml.deployments 
@@ -273,7 +311,8 @@ def find_deployed(cls, project_id: int):
               AND deployments.project_id = {q(project_id)}
             ORDER by deployments.created_at DESC
             LIMIT 1
-        """)
+        """
+        )
         if len(result) == 0:
             return None
 
@@ -303,19 +342,19 @@ def algorithm(self):
                 self._algorithm = pickle.loads(self.pickle)
             else:
                 self._algorithm = {
-                    'linear_regression': LinearRegression,
-                    'random_forest_regression': RandomForestRegressor,
-                    'random_forest_classification': RandomForestClassifier
-                }[self.algorithm_name + '_' + self.project.objective]()
-    
+                    "linear_regression": LinearRegression,
+                    "random_forest_regression": RandomForestRegressor,
+                    "random_forest_classification": RandomForestClassifier,
+                }[self.algorithm_name + "_" + self.project.objective]()
+
         return self._algorithm
 
     def fit(self, snapshot: Snapshot):
         """
-            Learns the parameters of this model and records them in the database.
+        Learns the parameters of this model and records them in the database.
 
-            Args:
-                snapshot (Snapshot): dataset used to train this model
+        Args:
+            snapshot (Snapshot): dataset used to train this model
         """
         X_train, X_test, y_train, y_test = snapshot.data()
 
@@ -328,7 +367,9 @@ def fit(self, snapshot: Snapshot):
         r2 = r2_score(y_test, y_pred)
 
         # Save the model
-        self.__dict__ = dict(plpy.execute(f"""
+        self.__dict__ = dict(
+            plpy.execute(
+                f"""
             UPDATE pgml.models
             SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}',
                 status = 'successful',
@@ -336,14 +377,18 @@ def fit(self, snapshot: Snapshot):
                 r2_score = {q(r2)}
             WHERE id = {q(self.id)}
             RETURNING *
-        """)[0])
+        """
+            )[0]
+        )
 
     def deploy(self):
         """Promote this model to the active version for the project that will be used for predictions"""
-        plpy.execute(f"""
+        plpy.execute(
+            f"""
             INSERT INTO pgml.deployments (project_id, model_id) 
             VALUES ({q(self.project_id)}, {q(self.id)})
-        """)
+        """
+        )
 
     def predict(self, data: list):
         """Use the model for a set of features.
@@ -358,12 +403,12 @@ def predict(self, data: list):
 
 
 def train(
-    project_name: str, 
+    project_name: str,
     objective: str,
-    relation_name: str, 
-    y_column_name: str, 
+    relation_name: str,
+    y_column_name: str,
     test_size: float or int = 0.1,
-    test_sampling: str = "random"
+    test_sampling: str = "random",
 ):
     """Create a regression model from a table or view filled with training data.
 
@@ -390,5 +435,5 @@ def train(
         model.fit(snapshot)
         if best_error is None or model.mean_squared_error < best_error:
             best_error = model.mean_squared_error
-            best_model = model        
+            best_model = model
     best_model.deploy()
diff --git a/pgml/pgml/sql.py b/pgml/pgml/sql.py
index 79ab69bdc..d8866d6c1 100644
--- a/pgml/pgml/sql.py
+++ b/pgml/pgml/sql.py
@@ -1,5 +1,6 @@
 from plpy import quote_literal
 
+
 def q(obj):
     if type(obj) == str:
         return quote_literal(obj)
diff --git a/pgml/tests/plpy.py b/pgml/tests/plpy.py
index 4bbbbc6fd..122092550 100644
--- a/pgml/tests/plpy.py
+++ b/pgml/tests/plpy.py
@@ -2,15 +2,18 @@
 
 execute_results = deque()
 
+
 def quote_literal(literal):
     return "'" + literal + "'"
 
-def execute(sql, lines = 0):
+
+def execute(sql, lines=0):
     if len(execute_results) > 0:
         result = execute_results.popleft()
         return result
-    else: 
+    else:
         return []
 
+
 def add_mock_result(result):
     execute_results.append(result)
diff --git a/pgml/tests/test_model.py b/pgml/tests/test_model.py
index 02605982d..cd7d26867 100644
--- a/pgml/tests/test_model.py
+++ b/pgml/tests/test_model.py
@@ -1,28 +1,71 @@
 # stub out plpy
 from . import plpy
 import sys
-sys.modules['plpy'] = plpy
+
+sys.modules["plpy"] = plpy
 
 import time
 import unittest
 from pgml import model
 
+
 class TestModel(unittest.TestCase):
     def test_the_world(self):
         plpy.add_mock_result(
-            [{"id": 1, "name": "Test", "objective": "regression", "created_at": time.time(), "updated_at": time.time()}]
-        )
-        plpy.add_mock_result(
-            [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "new", "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "name": "Test",
+                    "objective": "regression",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
-            "OK"
+            [
+                {
+                    "id": 1,
+                    "relation_name": "test",
+                    "y_column_name": "test_y",
+                    "test_size": 0.1,
+                    "test_sampling": "random",
+                    "status": "new",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
+        plpy.add_mock_result("OK")
         plpy.add_mock_result(
-            [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "created", "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "relation_name": "test",
+                    "y_column_name": "test_y",
+                    "test_size": 0.1,
+                    "test_sampling": "random",
+                    "status": "created",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -32,11 +75,37 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
 
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -46,11 +115,37 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
-        
+
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -60,6 +155,19 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         model.train("Test", "regression", "test", "test_y")