From 82b1f806c1b0cf8e076976398a86731a230437d5 Mon Sep 17 00:00:00 2001 From: Lev Date: Thu, 14 Apr 2022 13:12:00 -0700 Subject: [PATCH] lint --- pgml/pgml/model.py | 151 +++++++++++++++++++++++++-------------- pgml/pgml/sql.py | 1 + pgml/tests/plpy.py | 7 +- pgml/tests/test_model.py | 136 +++++++++++++++++++++++++++++++---- 4 files changed, 226 insertions(+), 69 deletions(-) diff --git a/pgml/pgml/model.py b/pgml/pgml/model.py index b34145aca..753df0141 100644 --- a/pgml/pgml/model.py +++ b/pgml/pgml/model.py @@ -9,10 +9,11 @@ from pgml.exceptions import PgMLException from pgml.sql import q + class Project(object): """ Use projects to refine multiple models of a particular dataset on a specific objective. - + Attributes: id (int): a unique identifier name (str): a human friendly unique identifier @@ -20,7 +21,7 @@ class Project(object): created_at (Timestamp): when this project was created updated_at (Timestamp): when this project was last updated """ - + _cache = {} def __init__(self): @@ -36,11 +37,14 @@ def find(cls, id: int): Returns: Project or None: instantiated from the database if found """ - result = plpy.execute(f""" + result = plpy.execute( + f""" SELECT * FROM pgml.projects WHERE id = {q(id)} - """, 1) + """, + 1, + ) if len(result) == 0: return None @@ -53,11 +57,11 @@ def find(cls, id: int): @classmethod def find_by_name(cls, name: str): """ - Get a Project from the database by name. - + Get a Project from the database by name. + This is the prefered API to retrieve projects, and they are cached by name to avoid needing to go to he database on every usage. - + Args: name (str): the project name Returns: @@ -65,13 +69,16 @@ def find_by_name(cls, name: str): """ if name in cls._cache: return cls._cache[name] - - result = plpy.execute(f""" + + result = plpy.execute( + f""" SELECT * FROM pgml.projects WHERE name = {q(name)} - """, 1) - if len(result)== 0: + """, + 1, + ) + if len(result) == 0: return None project = Project() @@ -84,7 +91,7 @@ def find_by_name(cls, name: str): def create(cls, name: str, objective: str): """ Create a Project and save it to the database. - + Args: name (str): a human friendly identifier objective (str): valid values are ["regression", "classification"]. @@ -93,11 +100,16 @@ def create(cls, name: str, objective: str): """ project = Project() - project.__dict__ = dict(plpy.execute(f""" + project.__dict__ = dict( + plpy.execute( + f""" INSERT INTO pgml.projects (name, objective) VALUES ({q(name)}, {q(objective)}) RETURNING * - """, 1)[0]) + """, + 1, + )[0] + ) project.__init__() cls._cache[name] = project return project @@ -112,10 +124,11 @@ def deployed_model(self): self._deployed_model = Model.find_deployed(self.id) return self._deployed_model + class Snapshot(object): """ Snapshots capture a set of training & test data for repeatability. - + Attributes: id (int): a unique identifier relation_name (str): the name of the table or view to snapshot @@ -126,11 +139,18 @@ class Snapshot(object): created_at (Timestamp): when this snapshot was created updated_at (Timestamp): when this snapshot was last updated """ + @classmethod - def create(cls, relation_name: str, y_column_name: str, test_size: float or int, test_sampling: str): + def create( + cls, + relation_name: str, + y_column_name: str, + test_size: float or int, + test_sampling: str, + ): """ - Create a Snapshot and save it to the database. - + Create a Snapshot and save it to the database. + This creates both a metadata record in the snapshots table, as well as creating a new table that holds a snapshot of all the data currently present in the relation so that training runs may be repeated, or further analysis may be conducted against the input. @@ -145,21 +165,33 @@ def create(cls, relation_name: str, y_column_name: str, test_size: float or int, """ snapshot = Snapshot() - snapshot.__dict__ = dict(plpy.execute(f""" + snapshot.__dict__ = dict( + plpy.execute( + f""" INSERT INTO pgml.snapshots (relation_name, y_column_name, test_size, test_sampling, status) VALUES ({q(relation_name)}, {q(y_column_name)}, {q(test_size)}, {q(test_sampling)}, 'new') RETURNING * - """, 1)[0]) - plpy.execute(f""" + """, + 1, + )[0] + ) + plpy.execute( + f""" CREATE TABLE pgml."snapshot_{snapshot.id}" AS SELECT * FROM "{snapshot.relation_name}"; - """) - snapshot.__dict__ = dict(plpy.execute(f""" + """ + ) + snapshot.__dict__ = dict( + plpy.execute( + f""" UPDATE pgml.snapshots SET status = 'created' WHERE id = {q(snapshot.id)} RETURNING * - """, 1)[0]) + """, + 1, + )[0] + ) return snapshot def data(self): @@ -167,10 +199,12 @@ def data(self): Returns: list, list, list, list: All rows from the snapshot split into X_train, X_test, y_train, y_test sets. """ - data = plpy.execute(f""" + data = plpy.execute( + f""" SELECT * FROM pgml."snapshot_{self.id}" - """) + """ + ) print(data) # Sanity check the data @@ -203,10 +237,10 @@ def data(self): y.append(y_) # Split into training and test sets - if self.test_sampling == 'random': + if self.test_sampling == "random": return train_test_split(X, y, test_size=self.test_size, random_state=0) else: - if self.test_sampling == 'first': + if self.test_sampling == "first": X.reverse() y.reverse() if isinstance(split, float): @@ -216,9 +250,9 @@ def data(self): split = int(self.test_size * X.len()) return X[:split], X[split:], y[:split], y[split:] - # TODO normalize and clean data + class Model(object): """Models use an algorithm on a snapshot of data to record the parameters learned. @@ -234,23 +268,26 @@ class Model(object): pickle (bytes): the serialized version of the model parameters algorithm: the in memory version of the model parameters that can make predictions """ + @classmethod def create(cls, project: Project, snapshot: Snapshot, algorithm_name: str): """ Create a Model and save it to the database. - + Args: - project (str): - snapshot (str): + project (str): + snapshot (str): algorithm_name (str): Returns: Model: instantiated from the database """ - result = plpy.execute(f""" + result = plpy.execute( + f""" INSERT INTO pgml.models (project_id, snapshot_id, algorithm_name, status) VALUES ({q(project.id)}, {q(snapshot.id)}, {q(algorithm_name)}, 'new') RETURNING * - """) + """ + ) model = Model() model.__dict__ = dict(result[0]) model.__init__() @@ -265,7 +302,8 @@ def find_deployed(cls, project_id: int): Returns: Model: that should currently be used for predictions of the project """ - result = plpy.execute(f""" + result = plpy.execute( + f""" SELECT models.* FROM pgml.models JOIN pgml.deployments @@ -273,7 +311,8 @@ def find_deployed(cls, project_id: int): AND deployments.project_id = {q(project_id)} ORDER by deployments.created_at DESC LIMIT 1 - """) + """ + ) if len(result) == 0: return None @@ -303,19 +342,19 @@ def algorithm(self): self._algorithm = pickle.loads(self.pickle) else: self._algorithm = { - 'linear_regression': LinearRegression, - 'random_forest_regression': RandomForestRegressor, - 'random_forest_classification': RandomForestClassifier - }[self.algorithm_name + '_' + self.project.objective]() - + "linear_regression": LinearRegression, + "random_forest_regression": RandomForestRegressor, + "random_forest_classification": RandomForestClassifier, + }[self.algorithm_name + "_" + self.project.objective]() + return self._algorithm def fit(self, snapshot: Snapshot): """ - Learns the parameters of this model and records them in the database. + Learns the parameters of this model and records them in the database. - Args: - snapshot (Snapshot): dataset used to train this model + Args: + snapshot (Snapshot): dataset used to train this model """ X_train, X_test, y_train, y_test = snapshot.data() @@ -328,7 +367,9 @@ def fit(self, snapshot: Snapshot): r2 = r2_score(y_test, y_pred) # Save the model - self.__dict__ = dict(plpy.execute(f""" + self.__dict__ = dict( + plpy.execute( + f""" UPDATE pgml.models SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}', status = 'successful', @@ -336,14 +377,18 @@ def fit(self, snapshot: Snapshot): r2_score = {q(r2)} WHERE id = {q(self.id)} RETURNING * - """)[0]) + """ + )[0] + ) def deploy(self): """Promote this model to the active version for the project that will be used for predictions""" - plpy.execute(f""" + plpy.execute( + f""" INSERT INTO pgml.deployments (project_id, model_id) VALUES ({q(self.project_id)}, {q(self.id)}) - """) + """ + ) def predict(self, data: list): """Use the model for a set of features. @@ -358,12 +403,12 @@ def predict(self, data: list): def train( - project_name: str, + project_name: str, objective: str, - relation_name: str, - y_column_name: str, + relation_name: str, + y_column_name: str, test_size: float or int = 0.1, - test_sampling: str = "random" + test_sampling: str = "random", ): """Create a regression model from a table or view filled with training data. @@ -390,5 +435,5 @@ def train( model.fit(snapshot) if best_error is None or model.mean_squared_error < best_error: best_error = model.mean_squared_error - best_model = model + best_model = model best_model.deploy() diff --git a/pgml/pgml/sql.py b/pgml/pgml/sql.py index 79ab69bdc..d8866d6c1 100644 --- a/pgml/pgml/sql.py +++ b/pgml/pgml/sql.py @@ -1,5 +1,6 @@ from plpy import quote_literal + def q(obj): if type(obj) == str: return quote_literal(obj) diff --git a/pgml/tests/plpy.py b/pgml/tests/plpy.py index 4bbbbc6fd..122092550 100644 --- a/pgml/tests/plpy.py +++ b/pgml/tests/plpy.py @@ -2,15 +2,18 @@ execute_results = deque() + def quote_literal(literal): return "'" + literal + "'" -def execute(sql, lines = 0): + +def execute(sql, lines=0): if len(execute_results) > 0: result = execute_results.popleft() return result - else: + else: return [] + def add_mock_result(result): execute_results.append(result) diff --git a/pgml/tests/test_model.py b/pgml/tests/test_model.py index 02605982d..cd7d26867 100644 --- a/pgml/tests/test_model.py +++ b/pgml/tests/test_model.py @@ -1,28 +1,71 @@ # stub out plpy from . import plpy import sys -sys.modules['plpy'] = plpy + +sys.modules["plpy"] = plpy import time import unittest from pgml import model + class TestModel(unittest.TestCase): def test_the_world(self): plpy.add_mock_result( - [{"id": 1, "name": "Test", "objective": "regression", "created_at": time.time(), "updated_at": time.time()}] - ) - plpy.add_mock_result( - [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "new", "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "name": "Test", + "objective": "regression", + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( - "OK" + [ + { + "id": 1, + "relation_name": "test", + "y_column_name": "test_y", + "test_size": 0.1, + "test_sampling": "random", + "status": "new", + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) + plpy.add_mock_result("OK") plpy.add_mock_result( - [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "created", "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "relation_name": "test", + "y_column_name": "test_y", + "test_size": 0.1, + "test_sampling": "random", + "status": "created", + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( [ @@ -32,11 +75,37 @@ def test_the_world(self): ] ) plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( [ @@ -46,11 +115,37 @@ def test_the_world(self): ] ) plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) - + plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) plpy.add_mock_result( [ @@ -60,6 +155,19 @@ def test_the_world(self): ] ) plpy.add_mock_result( - [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}] + [ + { + "id": 1, + "project_id": 1, + "snapshot_id": 1, + "algorithm_name": "linear", + "status": "new", + "r2_score": None, + "mean_squared_error": None, + "pickle": None, + "created_at": time.time(), + "updated_at": time.time(), + } + ] ) model.train("Test", "regression", "test", "test_y")