From 32d46f9f8730d07538bb02a01062157c869d4119 Mon Sep 17 00:00:00 2001
From: Montana Low
Date: Fri, 19 Aug 2022 18:32:37 -0700
Subject: [PATCH] add binary classification notebook
---
.../fixtures/{fraud.yml => notebooks.yml} | 1045 +++++++++++++----
1 file changed, 793 insertions(+), 252 deletions(-)
rename pgml-dashboard/notebooks/fixtures/{fraud.yml => notebooks.yml} (51%)
diff --git a/pgml-dashboard/notebooks/fixtures/fraud.yml b/pgml-dashboard/notebooks/fixtures/notebooks.yml
similarity index 51%
rename from pgml-dashboard/notebooks/fixtures/fraud.yml
rename to pgml-dashboard/notebooks/fixtures/notebooks.yml
index a1b4c25fc..eb748ba40 100644
--- a/pgml-dashboard/notebooks/fixtures/fraud.yml
+++ b/pgml-dashboard/notebooks/fixtures/notebooks.yml
@@ -4,173 +4,14 @@
name: 'Tutorial 1: Real Time Fraud Detection'
created_at: 2022-08-15 22:26:18.428227+00:00
updated_at: 2022-08-15 22:26:18.428241+00:00
-- model: notebooks.notebookcell
- pk: 12
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 4
- version: 1
- deleted_at: 2022-08-16 21:32:21.265114+00:00
-- model: notebooks.notebookcell
- pk: 13
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 5
- version: 1
- deleted_at: 2022-08-16 21:32:23.210172+00:00
-- model: notebooks.notebookcell
- pk: 14
- fields:
- notebook: 1
- cell_type: 3
- contents: asfd
- rendering: null
- execution_time: null
- cell_number: 6
- version: 1
- deleted_at: 2022-08-16 21:32:25.059892+00:00
-- model: notebooks.notebookcell
- pk: 16
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 8
- version: 1
- deleted_at: 2022-08-16 21:32:27.489903+00:00
-- model: notebooks.notebookcell
- pk: 18
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 9
- version: 2
- deleted_at: 2022-08-16 21:32:26.408082+00:00
-- model: notebooks.notebookcell
- pk: 19
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 10
- version: 1
- deleted_at: 2022-08-16 21:32:29.396631+00:00
-- model: notebooks.notebookcell
- pk: 23
- fields:
- notebook: 1
- cell_type: 1
- contents: test
- rendering: test
- execution_time: null
- cell_number: 12
- version: 1
- deleted_at: 2022-08-16 21:32:39.748416+00:00
-- model: notebooks.notebookcell
- pk: 24
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 13
- version: 1
- deleted_at: 2022-08-16 21:32:42.950765+00:00
-- model: notebooks.notebookcell
- pk: 25
- fields:
- notebook: 1
- cell_type: 3
- contents: ''
- rendering: null
- execution_time: null
- cell_number: 14
- version: 1
- deleted_at: 2022-08-16 21:32:43.832970+00:00
-- model: notebooks.notebookcell
- pk: 26
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 7
- version: 2
- deleted_at: 2022-08-16 21:32:25.873904+00:00
-- model: notebooks.notebookcell
- pk: 29
- fields:
- notebook: 1
- cell_type: 3
- contents: select 1
- rendering: null
- execution_time: null
- cell_number: 11
- version: 4
- deleted_at: 2022-08-16 21:32:39.009549+00:00
-- model: notebooks.notebookcell
- pk: 31
- fields:
- notebook: 1
- cell_type: 3
- contents: "INSERT INTO PRODUCTS (emoji, name, price, perishable) \nVALUES\n ('\U0001F4B0',
- '1oz gold bar', '$1999.99', false),\n ('\U0001F4D5', 'a tale of 2 cities',
- '$19.99', false),\n ('\U0001F96C', 'head of lettuce', '$1.99', true)\nRETURNING
- *;"
- rendering: null
- execution_time: null
- cell_number: 3
- version: 5
- deleted_at: null
-- model: notebooks.notebookcell
- pk: 33
- fields:
- notebook: 1
- cell_type: 1
- contents: "Now that we're in business, our first customer has shown up, named
- Alice. She is a chef that owns a salad shop, so she is going to create an order
- for 1,000 \U0001F96C `head of lettuce`.\n\nOur ecommerce site will record `orders`
- and their `line_items` in our database with the following schema."
- rendering: "Now that we're in business, our
- first customer has shown up, named Alice. She is a chef that owns a salad shop,
- so she is going to create an order for 1,000 \U0001F96C head of lettuce.
\nOur
- ecommerce site will record orders and their line_items
- in our database with the following schema.
"
- execution_time: null
- cell_number: 4
- version: 2
- deleted_at: null
-- model: notebooks.notebookcell
- pk: 35
+- model: notebooks.notebook
+ pk: 2
fields:
- notebook: 1
- cell_type: 3
- contents: "CREATE TABLE products (\n emoji TEXT PRIMARY KEY,\n name TEXT,\n
- \ price MONEY,\n perishable BOOLEAN\n);"
- rendering: null
- execution_time: null
- cell_number: 2
- version: 9
- deleted_at: null
+ name: 'Tutorial 2: Tumor Detection w/ Binary Classification'
+ created_at: 2022-08-19 23:10:23.120983+00:00
+ updated_at: 2022-08-19 23:10:23.120996+00:00
- model: notebooks.notebookcell
- pk: 36
+ pk: 1
fields:
notebook: 1
cell_type: 1
@@ -217,6 +58,8 @@
- Part 4: Adding More Features
+ - Part 5: Upgrading the Machine Learning Algorithm
+
Part 1: Ecommerce Application Data Model
@@ -263,6 +106,8 @@
Part 4: Adding More Features
+ Part 5: Upgrading the Machine Learning Algorithm
+
Part 1: Ecommerce Application Data Model
@@ -272,21 +117,54 @@
price, and other metadata, like whether or not they are perishable goods.
'
execution_time: null
cell_number: 1
- version: 4
+ version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 37
+ pk: 2
+ fields:
+ notebook: 1
+ cell_type: 3
+ contents: "CREATE TABLE products (\n emoji TEXT PRIMARY KEY,\n name TEXT,\n
+ \ price MONEY,\n perishable BOOLEAN\n);"
+ rendering: null
+ execution_time: null
+ cell_number: 2
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 3
+ fields:
+ notebook: 1
+ cell_type: 3
+ contents: "INSERT INTO PRODUCTS (emoji, name, price, perishable) \nVALUES\n ('\U0001F4B0',
+ '1oz gold bar', '$1999.99', false),\n ('\U0001F4D5', 'a tale of 2 cities',
+ '$19.99', false),\n ('\U0001F96C', 'head of lettuce', '$1.99', true)\nRETURNING
+ *;"
+ rendering: null
+ execution_time: null
+ cell_number: 3
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 4
fields:
notebook: 1
cell_type: 1
- contents: ters
- rendering: ters
+ contents: "Now that we're in business, our first customer has shown up, named
+ Alice. She is a chef that owns a salad shop, so she is going to create an order
+ for 1,000 \U0001F96C `head of lettuce`.\n\nOur ecommerce site will record `orders`
+ and their `line_items` in our database with the following schema."
+ rendering: "Now that we're in business, our
+ first customer has shown up, named Alice. She is a chef that owns a salad shop,
+ so she is going to create an order for 1,000 \U0001F96C head of lettuce.
\nOur
+ ecommerce site will record orders and their line_items
+ in our database with the following schema.
"
execution_time: null
- cell_number: 5
+ cell_number: 4
version: 1
- deleted_at: 2022-08-18 15:01:02.379644+00:00
+ deleted_at: null
- model: notebooks.notebookcell
- pk: 38
+ pk: 5
fields:
notebook: 1
cell_type: 3
@@ -299,7 +177,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 39
+ pk: 6
fields:
notebook: 1
cell_type: 1
@@ -311,7 +189,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 40
+ pk: 7
fields:
notebook: 1
cell_type: 3
@@ -325,7 +203,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 41
+ pk: 8
fields:
notebook: 1
cell_type: 1
@@ -347,7 +225,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 42
+ pk: 9
fields:
notebook: 1
cell_type: 3
@@ -359,7 +237,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 43
+ pk: 10
fields:
notebook: 1
cell_type: 1
@@ -372,7 +250,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 44
+ pk: 11
fields:
notebook: 1
cell_type: 3
@@ -388,7 +266,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 45
+ pk: 12
fields:
notebook: 1
cell_type: 1
@@ -404,7 +282,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 46
+ pk: 13
fields:
notebook: 1
cell_type: 3
@@ -417,7 +295,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 47
+ pk: 14
fields:
notebook: 1
cell_type: 1
@@ -430,7 +308,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 48
+ pk: 15
fields:
notebook: 1
cell_type: 3
@@ -446,7 +324,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 49
+ pk: 16
fields:
notebook: 1
cell_type: 1
@@ -461,7 +339,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 50
+ pk: 17
fields:
notebook: 1
cell_type: 3
@@ -473,7 +351,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 51
+ pk: 18
fields:
notebook: 1
cell_type: 1
@@ -485,7 +363,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 52
+ pk: 19
fields:
notebook: 1
cell_type: 3
@@ -498,7 +376,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 53
+ pk: 20
fields:
notebook: 1
cell_type: 1
@@ -523,7 +401,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 54
+ pk: 21
fields:
notebook: 1
cell_type: 3
@@ -540,7 +418,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 55
+ pk: 22
fields:
notebook: 1
cell_type: 1
@@ -551,7 +429,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 56
+ pk: 23
fields:
notebook: 1
cell_type: 3
@@ -562,7 +440,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 57
+ pk: 24
fields:
notebook: 1
cell_type: 1
@@ -583,7 +461,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 58
+ pk: 25
fields:
notebook: 1
cell_type: 3
@@ -600,7 +478,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 59
+ pk: 26
fields:
notebook: 1
cell_type: 3
@@ -611,7 +489,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 60
+ pk: 27
fields:
notebook: 1
cell_type: 1
@@ -642,7 +520,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 61
+ pk: 28
fields:
notebook: 1
cell_type: 3
@@ -658,7 +536,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 62
+ pk: 29
fields:
notebook: 1
cell_type: 1
@@ -705,7 +583,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 63
+ pk: 30
fields:
notebook: 1
cell_type: 3
@@ -719,7 +597,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 64
+ pk: 31
fields:
notebook: 1
cell_type: 1
@@ -731,7 +609,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 65
+ pk: 32
fields:
notebook: 1
cell_type: 3
@@ -747,7 +625,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 66
+ pk: 33
fields:
notebook: 1
cell_type: 1
@@ -759,7 +637,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 67
+ pk: 34
fields:
notebook: 1
cell_type: 3
@@ -772,7 +650,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 68
+ pk: 35
fields:
notebook: 1
cell_type: 1
@@ -783,7 +661,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 69
+ pk: 36
fields:
notebook: 1
cell_type: 3
@@ -799,7 +677,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 70
+ pk: 37
fields:
notebook: 1
cell_type: 1
@@ -812,7 +690,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 71
+ pk: 38
fields:
notebook: 1
cell_type: 3
@@ -825,7 +703,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 72
+ pk: 39
fields:
notebook: 1
cell_type: 1
@@ -837,7 +715,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 73
+ pk: 40
fields:
notebook: 1
cell_type: 3
@@ -853,7 +731,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 74
+ pk: 41
fields:
notebook: 1
cell_type: 1
@@ -866,7 +744,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 75
+ pk: 42
fields:
notebook: 1
cell_type: 3
@@ -879,7 +757,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 76
+ pk: 43
fields:
notebook: 1
cell_type: 1
@@ -896,7 +774,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 77
+ pk: 44
fields:
notebook: 1
cell_type: 3
@@ -910,7 +788,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 78
+ pk: 45
fields:
notebook: 1
cell_type: 1
@@ -940,7 +818,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 79
+ pk: 46
fields:
notebook: 1
cell_type: 3
@@ -959,7 +837,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 80
+ pk: 47
fields:
notebook: 1
cell_type: 1
@@ -972,7 +850,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 81
+ pk: 48
fields:
notebook: 1
cell_type: 3
@@ -980,15 +858,15 @@
-- a friendly name we'll use to identify this machine learning project\n task
=> 'classification', -- we want to classify into true or false\n relation_name
=> 'fraud_samples', -- our view of the data\n y_column_name => 'fraudulent',
- -- the \"labels\"\n test_size => 0.5 -- use half the data for testing rather
- than the default test size of 25%\n);"
+ -- the \"labels\"\n test_sampling => 'last',\n test_size => 0.5 -- use half
+ the data for testing rather than the default test size of 25%\n);"
rendering: null
execution_time: null
cell_number: 48
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 82
+ pk: 49
fields:
notebook: 1
cell_type: 1
@@ -1000,7 +878,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 83
+ pk: 50
fields:
notebook: 1
cell_type: 3
@@ -1011,7 +889,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 84
+ pk: 51
fields:
notebook: 1
cell_type: 1
@@ -1023,7 +901,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 85
+ pk: 52
fields:
notebook: 1
cell_type: 3
@@ -1036,7 +914,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 86
+ pk: 53
fields:
notebook: 1
cell_type: 1
@@ -1095,29 +973,36 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 87
+ pk: 54
fields:
notebook: 1
cell_type: 1
- contents: When you're out of ideas for features that might help the model distinguish
+ contents: 'Part 5: Upgrading the Machine Learning Algorithm
+
+ ------------------------------------------
+
+
+ When you''re out of ideas for features that might help the model distinguish
orders that are likely to result in chargebacks, you may want to start testing
different algorithms to see how the performance changes. PostgresML makes algorithm
selection as easy as passing an additional parameter to `pgml.train`. You may
want to test them all just to see, but `xgboost` typically gives excellent performance
- in terms of both accuracy and latency.
- rendering: When you're out of ideas for features
- that might help the model distinguish orders that are likely to result in chargebacks,
- you may want to start testing different algorithms to see how the performance
- changes. PostgresML makes algorithm selection as easy as passing an additional
- parameter to pgml.train. You may want to test them all just to
- see, but xgboost typically gives excellent performance in terms
- of both accuracy and latency.
+ in terms of both accuracy and latency.'
+ rendering: 'Part 5: Upgrading the Machine Learning
+ Algorithm
+
+ When you''re out of ideas for features that might help the model distinguish
+ orders that are likely to result in chargebacks, you may want to start testing
+ different algorithms to see how the performance changes. PostgresML makes algorithm
+ selection as easy as passing an additional parameter to pgml.train.
+ You may want to test them all just to see, but xgboost typically
+ gives excellent performance in terms of both accuracy and latency.
'
execution_time: null
cell_number: 54
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 88
+ pk: 55
fields:
notebook: 1
cell_type: 3
@@ -1134,33 +1019,26 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 89
+ pk: 56
fields:
notebook: 1
cell_type: 1
- contents: 'Using Regression instead of Classificaiton
-
- ------------------------------------------
-
-
- So far we''ve been training a classifier that gives us a binary 0 or 1 output
- to indicate likely fraud or not. If we''d like to refine our application response
+ contents: So far we've been training a classifier that gives us a binary 0 or
+ 1 output to indicate fraud or not. If we'd like to refine our application response
to the models predictions in a more nuanced way, say high/medium/low risk instead
of binary, we can use "regression" instead of "classification" to predict a
- likelihood between 0 and 1, instead of binary.'
- rendering: 'Using Regression instead of Classificaiton
-
- So far we''ve been training a classifier that gives us a binary 0 or 1 output
- to indicate likely fraud or not. If we''d like to refine our application response
- to the models predictions in a more nuanced way, say high/medium/low risk instead
- of binary, we can use "regression" instead of "classification" to predict a
- likelihood between 0 and 1, instead of binary.
'
+ likelihood between 0 and 1, instead of binary.
+ rendering: So far we've been training a classifier
+ that gives us a binary 0 or 1 output to indicate fraud or not. If we'd like
+ to refine our application response to the models predictions in a more nuanced
+ way, say high/medium/low risk instead of binary, we can use "regression" instead
+ of "classification" to predict a likelihood between 0 and 1, instead of binary.
execution_time: null
cell_number: 56
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 90
+ pk: 57
fields:
notebook: 1
cell_type: 3
@@ -1176,7 +1054,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 91
+ pk: 58
fields:
notebook: 1
cell_type: 1
@@ -1195,7 +1073,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 92
+ pk: 59
fields:
notebook: 1
cell_type: 3
@@ -1209,3 +1087,666 @@
cell_number: 59
version: 1
deleted_at: null
+- model: notebooks.notebookcell
+ pk: 60
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Binary classification means categorizing data into 2 categories. Usually
+ these are categories like:
+
+
+ - `True` or `False`
+
+ - `0` or `1`
+
+ - `hot_dog` or `not_hot_dog`
+
+
+ These categories divide a population into things we care about, and things we
+ can ignore. Binary classification is a common task for machine learning models.
+ It can be applied across a broad set of scenarios, once you understand the way
+ to structure your problem as a set of example data with labeled outcomes.
+
+
+ In this tutorial, we''ll train models using various "supervised learning" algorithms
+ to classify medical samples as benign or malignant. Supervised learning techniques
+ require us to label the sample data for the algorithm to learn how the inputs
+ correlate with the labels. After the algorithm has been trained on the labeled
+ data set we created, we can present it with new unlabeled data to classify based
+ on the most likely outcome.
+
+
+ As we saw in [Tutorial 1: Real Time Fraud Detection](/notebooks/notebook/1) understanding
+ the structure of the data and the labels is a complex and critical step for
+ real world machine learning projects. In this example we''ll focus more on the
+ different algorithms, and use an academic benchmark dataset that already includes
+ binary labels from UCI ML Breast Cancer Wisconsin. Features were computed from
+ a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe
+ characteristics of the cell nuclei present in the image. The labels are either
+ True for a malignant sample or False for a benign sample.
+
+
+ You can load this dataset into your Postgres database with the following SQL.'
+ rendering: 'Binary classification means categorizing
+ data into 2 categories. Usually these are categories like:
+
+
+
+ True or False
+
+ 0 or 1
+
+ hot_dog or not_hot_dog
+
+
+
+ These categories divide a population into things we care about, and things
+ we can ignore. Binary classification is a common task for machine learning models.
+ It can be applied across a broad set of scenarios, once you understand the way
+ to structure your problem as a set of example data with labeled outcomes.
+
+ In this tutorial, we''ll train models using various "supervised learning"
+ algorithms to classify medical samples as benign or malignant. Supervised learning
+ techniques require us to label the sample data for the algorithm to learn how
+ the inputs correlate with the labels. After the algorithm has been trained on
+ the labeled data set we created, we can present it with new unlabeled data to
+ classify based on the most likely outcome.
+
+ As we saw in Tutorial 1: Real Time Fraud
+ Model understanding the structure of the data and the labels is a complex
+ and critical step for real world machine learning projects. In this example
+ we''ll focus more on the different algorithms, and use an academic benchmark
+ dataset that already includes binary labels from UCI ML Breast Cancer Wisconsin.
+ Features were computed from a digitized image of a fine needle aspirate (FNA)
+ of a breast mass. They describe characteristics of the cell nuclei present in
+ the image. The labels are either True for a malginant sample of False for a
+ benign sample.
+
+ You can load this dataset into your Postgres database with the following
+ SQL.
'
+ execution_time: null
+ cell_number: 1
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 61
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: SELECT pgml.load_dataset('breast_cancer');
+ rendering: "\n
\n \n \n \n
+ \ | load_dataset | \n \n
\n \n
+ \ \n \n \n \n | OK | \n \n
\n \n
+ \ \n
\n
\n"
+ execution_time: '00:00:02.498819'
+ cell_number: 2
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 62
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: This function has created a new table in your database named `pgml.breast_cancer`.
+ Let's look at a random sample of the data with some more SQL.
+ rendering: This function has created a new table
+ in your database named pgml.breast_cancer. Let's look at a random
+ sample of the data with some more SQL.
+ execution_time: null
+ cell_number: 3
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 63
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT * \nFROM pgml.breast_cancer \nORDER BY random()\nLIMIT 10;"
+ rendering: "\n
\n \n \n \n
+ \ | mean radius | \n \n mean
+ texture | \n \n mean perimeter | \n
+ \ \n mean area | \n \n mean
+ smoothness | \n \n mean compactness | \n
+ \ \n mean concavity | \n \n mean
+ concave points | \n \n mean symmetry | \n
+ \ \n mean fractal dimension | \n \n radius
+ error | \n \n texture error | \n
+ \ \n perimeter error | \n \n area
+ error | \n \n smoothness error | \n
+ \ \n compactness error | \n \n concavity
+ error | \n \n concave points error | \n
+ \ \n symmetry error | \n \n fractal
+ dimension error | \n \n worst radius | \n
+ \ \n worst texture | \n \n worst
+ perimeter | \n \n worst area | \n
+ \ \n worst smoothness | \n \n worst
+ compactness | \n \n worst concavity | \n
+ \ \n worst concave points | \n \n worst
+ symmetry | \n \n worst fractal dimension | \n
+ \ \n malignant | \n \n
\n \n
+ \ \n \n \n \n | 12.77 | \n \n 21.41 | \n
+ \ \n 82.02 | \n \n 507.4 | \n \n 0.08749 | \n
+ \ \n 0.06601 | \n \n 0.03112 | \n \n 0.02864 | \n
+ \ \n 0.1694 | \n \n 0.06287 | \n \n 0.7311 | \n
+ \ \n 1.748 | \n \n 5.118 | \n \n 53.65 | \n
+ \ \n 0.004571 | \n \n 0.0179 | \n \n 0.02176 | \n
+ \ \n 0.01757 | \n \n 0.03373 | \n \n 0.005875 | \n
+ \ \n 13.75 | \n \n 23.5 | \n \n 89.04 | \n
+ \ \n 579.5 | \n \n 0.09388 | \n \n 0.08978 | \n
+ \ \n 0.05186 | \n \n 0.04773 | \n \n 0.2179 | \n
+ \ \n 0.06871 | \n \n False | \n \n
\n
+ \ \n \n \n | 12.22 | \n \n 20.04 | \n
+ \ \n 79.47 | \n \n 453.1 | \n \n 0.1096 | \n
+ \ \n 0.1152 | \n \n 0.08175 | \n \n 0.02166 | \n
+ \ \n 0.2124 | \n \n 0.06894 | \n \n 0.1811 | \n
+ \ \n 0.7959 | \n \n 0.9857 | \n \n 12.58 | \n
+ \ \n 0.006272 | \n \n 0.02198 | \n \n 0.03966 | \n
+ \ \n 0.009894 | \n \n 0.0132 | \n \n 0.003813 | \n
+ \ \n 13.16 | \n \n 24.17 | \n \n 85.13 | \n
+ \ \n 515.3 | \n \n 0.1402 | \n \n 0.2315 | \n
+ \ \n 0.3535 | \n \n 0.08088 | \n \n 0.2709 | \n
+ \ \n 0.08839 | \n \n False | \n \n
\n
+ \ \n \n \n | 12.4 | \n \n 17.68 | \n
+ \ \n 81.47 | \n \n 467.8 | \n \n 0.1054 | \n
+ \ \n 0.1316 | \n \n 0.07741 | \n \n 0.02799 | \n
+ \ \n 0.1811 | \n \n 0.07102 | \n \n 0.1767 | \n
+ \ \n 1.46 | \n \n 2.204 | \n \n 15.43 | \n
+ \ \n 0.01 | \n \n 0.03295 | \n \n 0.04861 | \n
+ \ \n 0.01167 | \n \n 0.02187 | \n \n 0.006005 | \n
+ \ \n 12.88 | \n \n 22.91 | \n \n 89.61 | \n
+ \ \n 515.8 | \n \n 0.145 | \n \n 0.2629 | \n
+ \ \n 0.2403 | \n \n 0.0737 | \n \n 0.2556 | \n
+ \ \n 0.09359 | \n \n False | \n \n
\n
+ \ \n \n \n | 14.02 | \n \n 15.66 | \n
+ \ \n 89.59 | \n \n 606.5 | \n \n 0.07966 | \n
+ \ \n 0.05581 | \n \n 0.02087 | \n \n 0.02652 | \n
+ \ \n 0.1589 | \n \n 0.05586 | \n \n 0.2142 | \n
+ \ \n 0.6549 | \n \n 1.606 | \n \n 19.25 | \n
+ \ \n 0.004837 | \n \n 0.009238 | \n \n 0.009213 | \n
+ \ \n 0.01076 | \n \n 0.01171 | \n \n 0.002104 | \n
+ \ \n 14.91 | \n \n 19.31 | \n \n 96.53 | \n
+ \ \n 688.9 | \n \n 0.1034 | \n \n 0.1017 | \n
+ \ \n 0.0626 | \n \n 0.08216 | \n \n 0.2136 | \n
+ \ \n 0.0671 | \n \n False | \n \n
\n
+ \ \n \n \n | 19.59 | \n \n 18.15 | \n
+ \ \n 130.7 | \n \n 1214.0 | \n \n 0.112 | \n
+ \ \n 0.1666 | \n \n 0.2508 | \n \n 0.1286 | \n
+ \ \n 0.2027 | \n \n 0.06082 | \n \n 0.7364 | \n
+ \ \n 1.048 | \n \n 4.792 | \n \n 97.07 | \n
+ \ \n 0.004057 | \n \n 0.02277 | \n \n 0.04029 | \n
+ \ \n 0.01303 | \n \n 0.01686 | \n \n 0.003318 | \n
+ \ \n 26.73 | \n \n 26.39 | \n \n 174.9 | \n
+ \ \n 2232.0 | \n \n 0.1438 | \n \n 0.3846 | \n
+ \ \n 0.681 | \n \n 0.2247 | \n \n 0.3643 | \n
+ \ \n 0.09223 | \n \n True | \n \n
\n
+ \ \n \n \n | 8.726 | \n \n 15.83 | \n
+ \ \n 55.84 | \n \n 230.9 | \n \n 0.115 | \n
+ \ \n 0.08201 | \n \n 0.04132 | \n \n 0.01924 | \n
+ \ \n 0.1649 | \n \n 0.07633 | \n \n 0.1665 | \n
+ \ \n 0.5864 | \n \n 1.354 | \n \n 8.966 | \n
+ \ \n 0.008261 | \n \n 0.02213 | \n \n 0.03259 | \n
+ \ \n 0.0104 | \n \n 0.01708 | \n \n 0.003806 | \n
+ \ \n 9.628 | \n \n 19.62 | \n \n 64.48 | \n
+ \ \n 284.4 | \n \n 0.1724 | \n \n 0.2364 | \n
+ \ \n 0.2456 | \n \n 0.105 | \n \n 0.2926 | \n
+ \ \n 0.1017 | \n \n False | \n \n
\n
+ \ \n \n \n | 17.99 | \n \n 10.38 | \n
+ \ \n 122.8 | \n \n 1001.0 | \n \n 0.1184 | \n
+ \ \n 0.2776 | \n \n 0.3001 | \n \n 0.1471 | \n
+ \ \n 0.2419 | \n \n 0.07871 | \n \n 1.095 | \n
+ \ \n 0.9053 | \n \n 8.589 | \n \n 153.4 | \n
+ \ \n 0.006399 | \n \n 0.04904 | \n \n 0.05373 | \n
+ \ \n 0.01587 | \n \n 0.03003 | \n \n 0.006193 | \n
+ \ \n 25.38 | \n \n 17.33 | \n \n 184.6 | \n
+ \ \n 2019.0 | \n \n 0.1622 | \n \n 0.6656 | \n
+ \ \n 0.7119 | \n \n 0.2654 | \n \n 0.4601 | \n
+ \ \n 0.1189 | \n \n True | \n \n
\n
+ \ \n \n \n | 11.74 | \n \n 14.69 | \n
+ \ \n 76.31 | \n \n 426.0 | \n \n 0.08099 | \n
+ \ \n 0.09661 | \n \n 0.06726 | \n \n 0.02639 | \n
+ \ \n 0.1499 | \n \n 0.06758 | \n \n 0.1924 | \n
+ \ \n 0.6417 | \n \n 1.345 | \n \n 13.04 | \n
+ \ \n 0.006982 | \n \n 0.03916 | \n \n 0.04017 | \n
+ \ \n 0.01528 | \n \n 0.0226 | \n \n 0.006822 | \n
+ \ \n 12.45 | \n \n 17.6 | \n \n 81.25 | \n
+ \ \n 473.8 | \n \n 0.1073 | \n \n 0.2793 | \n
+ \ \n 0.269 | \n \n 0.1056 | \n \n 0.2604 | \n
+ \ \n 0.09879 | \n \n False | \n \n
\n
+ \ \n \n \n | 9.667 | \n \n 18.49 | \n
+ \ \n 61.49 | \n \n 289.1 | \n \n 0.08946 | \n
+ \ \n 0.06258 | \n \n 0.02948 | \n \n 0.01514 | \n
+ \ \n 0.2238 | \n \n 0.06413 | \n \n 0.3776 | \n
+ \ \n 1.35 | \n \n 2.569 | \n \n 22.73 | \n
+ \ \n 0.007501 | \n \n 0.01989 | \n \n 0.02714 | \n
+ \ \n 0.009883 | \n \n 0.0196 | \n \n 0.003913 | \n
+ \ \n 11.14 | \n \n 25.62 | \n \n 70.88 | \n
+ \ \n 385.2 | \n \n 0.1234 | \n \n 0.1542 | \n
+ \ \n 0.1277 | \n \n 0.0656 | \n \n 0.3174 | \n
+ \ \n 0.08524 | \n \n False | \n \n
\n
+ \ \n \n \n | 13.08 | \n \n 15.71 | \n
+ \ \n 85.63 | \n \n 520.0 | \n \n 0.1075 | \n
+ \ \n 0.127 | \n \n 0.04568 | \n \n 0.0311 | \n
+ \ \n 0.1967 | \n \n 0.06811 | \n \n 0.1852 | \n
+ \ \n 0.7477 | \n \n 1.383 | \n \n 14.67 | \n
+ \ \n 0.004097 | \n \n 0.01898 | \n \n 0.01698 | \n
+ \ \n 0.00649 | \n \n 0.01678 | \n \n 0.002425 | \n
+ \ \n 14.5 | \n \n 20.49 | \n \n 96.09 | \n
+ \ \n 630.5 | \n \n 0.1312 | \n \n 0.2776 | \n
+ \ \n 0.189 | \n \n 0.07283 | \n \n 0.3184 | \n
+ \ \n 0.08183 | \n \n False | \n \n
\n
+ \ \n \n
\n
\n"
+ execution_time: '00:00:00.007697'
+ cell_number: 4
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 64
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'That''s a lot of numeric feature data describing various attributes
+ of the cells, but if you scroll all the way to the right above, after running
+ the query, you''ll see that each sample set of feature data is labeled `malignant`
+ [`True` or `False`]. It would be extremely difficult for a human to study all
+ these numbers, and see how they correlate with malignant or not, and then be
+ able to make a prediction for new samples, but mathematicians have been working
+ on algorithms to do exactly this using computers which happen to be exceptionally
+ good at this by now. This is statistical machine learning.
+
+
+ PostgresML makes it easy to use this data to create a model. It only takes a
+ single function call with a few parameters.'
+ rendering: 'That''s a lot of numeric feature
+ data describing various attributes of the cells, but if you scroll all the way
+ to the right above, after running the query, you''ll see that each sample set
+ of feature data is labeled malignant [True or False].
+ It would be extremely difficult for a human to study all these numbers, and
+ see how they correlate with malignant or not, and then be able to make a prediction
+ for new samples, but mathemeticians have been working on algorithms to do exactly
+ this using computers which happen to be exceptionally good at this by now. This
+ is statistical machine learning.
+
+ PostgresML makes it easy to use this data to create a model. It only takes
+ a single function call with a few parameters.
'
+ execution_time: null
+ cell_number: 5
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 65
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT * FROM pgml.train(\n project_name => 'Breast Cancer Detection',
+ \n task => 'classification', \n relation_name => 'pgml.breast_cancer', \n
+ \ y_column_name => 'malignant'\n);"
+ rendering: "\n
\n \n \n \n
+ \ | project_name | \n \n task | \n
+ \ \n algorithm_name | \n \n status | \n
+ \ \n
\n \n \n \n \n \n | Breast
+ Cancer Detection | \n \n classification | \n \n linear | \n
+ \ \n not deployed | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.802388'
+ cell_number: 6
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 66
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "\U0001F3C1 Congratulations \U0001F3C1\n---------------------\n\nYou've
+ just created a machine learning model, tested its accuracy, and deployed it
+ to production. PostgresML orchestrated a bunch of the traditional ML drudgery
+ in that couple of seconds to make it as simple as possible for you to get value.
+ We'll organize our work on this task under the project name \"Breast Cancer
+ Detection\", which you can now see it in your [list of projects](/projects/).
+ You can see that the first model uses the default linear algorithm, and that
+ it achieves an [F1 score](https://en.wikipedia.org/wiki/F-score) in the mid
+ 90's, which is pretty good. A score of 1.0 is perfect, and 0.5 would be as good
+ as random guessing. The better the F1 score, the better the algorithm can perform
+ on this dataset. \n\nWe can now use this model to make some predictions in real
+ time, using the training data as input to the `pgml.predict` function."
+ rendering: "\U0001F3C1 Congratulations \U0001F3C1
\nYou've
+ just created a machine learning model, tested its accuracy, and deployed it
+ to production. PostgresML orchestrated a bunch of the traditional ML drudgery
+ in that couple of seconds to make it as simple as possible for you to get value.
+ We'll organize our work on this task under the project name \"Breast Cancer
+ Detection\", which you can now see in your list of
+ projects. You can see that the first model uses the default linear algorithm,
+ and that it achieves an F1
+ score in the mid 90's, which is pretty good. A score of 1.0 is perfect,
+ and 0.5 would be as good as random guessing. The better the F1 score, the better
+ the algorithm can perform on this dataset.
\nWe can now use this model
+ to make some predictions in real time, using the training data as input to the
+ pgml.predict function.
"
+ execution_time: null
+ cell_number: 7
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 67
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT malignant, pgml.predict(\n 'Breast Cancer Detection', \n
+ \ ARRAY[\n \"mean radius\", \n \"mean texture\", \n \"mean
+ perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n
+ \ \"mean symmetry\",\n \"mean fractal dimension\",\n \"radius
+ error\",\n \"texture error\",\n \"perimeter error\",\n \"area
+ error\",\n \"smoothness error\",\n \"compactness error\",\n \"concavity
+ error\",\n \"concave points error\",\n \"symmetry error\",\n \"fractal
+ dimension error\",\n \"worst radius\",\n \"worst texture\",\n
+ \ \"worst perimeter\",\n \"worst area\",\n \"worst smoothness\",\n
+ \ \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ ]\n) AS prediction\nFROM pgml.breast_cancer\nORDER BY random()\nLIMIT 10;"
+ rendering: "\n
\n \n \n \n
+ \ | malignant | \n \n prediction | \n
+ \ \n
\n \n \n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.657161'
+ cell_number: 8
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 68
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "You can see the model is pretty good at predicting `0` for non malignant
+ samples, and `1` for malignant samples. This isn't a great test though, because
+ we're using the same data we trained with. We could have just looked up the
+ data in the database table if this is all we wanted to do. The point of training
+ a machine learning model is to generalize these statistics to data we've never
+ seen before. What do you think this model would predict if all the input features
+ happened to be 0 or 1? How does that compare to what it's seen before? \n\nIt's
+ easy to test the model and see by providing new sample data in real time. There
+ are lots of ways we could feed new data to a model in Postgres. We could write
+ new samples to a table just like our training data, or we could pass parameters
+ directly into a query without recording anything in the database at all. Postgres
+ gives us a lot of ways to get data in and out at run time. We'll demonstrate
+ with a `VALUES` example for batch prediction."
+ rendering: 'You can see the model is pretty
+ good at predicting 0 for non malignant samples, and 1
+ for malignant samples. This isn''t a great test though, because we''re using
+ the same data we trained with. We could have just looked up the data in the
+ database table if this is all we wanted to do. The point of training a machine
+ learning model is to generalize these statistics to data we''ve never seen
+ before. What do you think this model would predict if all the input features
+ happened to be 0 or 1? How does that compare to what it''s seen before?
+
+ It''s easy to test the model and see by providing new sample data in real
+ time. There are lots of ways we could feed new data to a model in Postgres.
+ We could write new samples to a table just like our training data, or we could
+ pass parameters directly into a query without recording anything in the database
+ at all. Postgres gives us a lot of ways to get data in and out at run time.
+ We''ll demonstrate with a VALUES example for batch prediction.
'
+ execution_time: null
+ cell_number: 9
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 69
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT sample_name, pgml.predict(\n 'Breast Cancer Detection', \n
+ \ ARRAY[\n \"mean radius\", \n \"mean texture\", \n \"mean
+ perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n
+ \ \"mean symmetry\",\n \"mean fractal dimension\",\n \"radius
+ error\",\n \"texture error\",\n \"perimeter error\",\n \"area
+ error\",\n \"smoothness error\",\n \"compactness error\",\n \"concavity
+ error\",\n \"concave points error\",\n \"symmetry error\",\n \"fractal
+ dimension error\",\n \"worst radius\",\n \"worst texture\",\n
+ \ \"worst perimeter\",\n \"worst area\",\n \"worst smoothness\",\n
+ \ \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ ]\n) AS prediction\nFROM (\n VALUES \n \t('all_zeroes',0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),\n
+ \ \t('all_ones', 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)\n)
+ \n AS t (\n \"sample_name\",\n \"mean radius\", \n \"mean texture\",
+ \n \"mean perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n \"mean
+ symmetry\",\n \"mean fractal dimension\",\n \"radius error\",\n \"texture
+ error\",\n \"perimeter error\",\n \"area error\",\n \"smoothness error\",\n
+ \ \"compactness error\",\n \"concavity error\",\n \"concave points error\",\n
+ \ \"symmetry error\",\n \"fractal dimension error\",\n \"worst radius\",\n
+ \ \"worst texture\",\n \"worst perimeter\",\n \"worst area\",\n \"worst
+ smoothness\",\n \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ );"
+ rendering: "\n
\n \n \n \n
+ \ | sample_name | \n \n prediction | \n
+ \ \n
\n \n \n \n \n \n | all_zeroes | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | all_ones | \n
+ \ \n 0.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.626657'
+ cell_number: 10
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 70
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Even though the inputs are not data we''ve ever seen before, the model
+ is telling us both of these new samples are likely to be benign based on their
+ statistical correlations to the training samples we had labelled. As we collect
+ new data samples, we could potentially use this model for multiple purposes,
+ like screening the samples before doing further more expensive or invasive analysis.
+
+
+ To demonstrate a more concise call that omits all the feature names (careful
+ to get the order right):'
+ rendering: 'Even though the inputs are not data
+ we''ve ever seen before, the model is telling us both of these new samples are
+ likely to be benign based on their statistical correlations to the training samples
+ we had labelled. As we collect new data samples, we could potentially use this
+ model for multiple purposes, like screening the samples before doing further
+ more expensive or invasive analysis.
+
+ To demonstrate a more concise call that omits all the feature names (careful
+ to get the order right):
'
+ execution_time: null
+ cell_number: 11
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 71
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT pgml.predict(\n 'Breast Cancer Detection', \n ARRAY[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100000]\n)"
+ rendering: "\n
\n \n \n \n
+ \ | predict | \n \n
\n \n \n
+ \ \n \n \n | 1.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.643660'
+ cell_number: 12
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 72
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Ah hah! We put a really big number into the last feature (worst fractal
+ dimension), and got the model to give us a `True` prediction, indicating that
+ large values there correlate with a malignant sample all else being equal using
+ our default linear algorithm. There are lots of ways we can probe the model
+ with test data, but before we spend too much time on this one, it might be informative
+ to try other algorithms.
+
+
+ PostgresML makes it easy to reuse your training data with many of the best algorithms
+ available. Why not try them all?'
+ rendering: 'Ah hah! We put a really big number
+ into the last feature (worst fractal dimension), and got the model to give us
+ a True prediction, indicating that large values there correlate
+ with a malignant sample all else being equal using our default linear algorithm.
+ There are lots of ways we can probe the model with test data, but before we
+ spend too much time on this one, it might be informative to try other algorithms.
+
+ PostgresML makes it easy to reuse your training data with many of the best
+ algorithms available. Why not try them all?
'
+ execution_time: null
+ cell_number: 13
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 73
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: '--
+
+ -- After a project has been trained, omitted parameters will be reused from
+ previous training runs
+
+ -- In these examples we''ll reuse the training data snapshots from the initial
+ call.
+
+ --
+
+
+ -- Linear Models
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''ridge'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''stochastic_gradient_descent'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''perceptron'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''passive_aggressive'');
+
+
+ -- Support Vector Machines
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''svm'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''nu_svm'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''linear_svm'');
+
+
+ -- Ensembles
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''ada_boost'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''bagging'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''extra_trees'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''gradient_boosting_trees'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''random_forest'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+
+ -- Gradient Boosting
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost_random_forest'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''lightgbm'',
+ hyperparams => ''{"n_estimators": 1}'');'
+ rendering: "\n
\n \n \n \n
+ \ | project_name | \n \n task | \n
+ \ \n algorithm_name | \n \n status | \n
+ \ \n
\n \n \n \n \n \n | Breast
+ Cancer Detection | \n \n None | \n \n lightgbm | \n
+ \ \n not deployed | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:03.250016'
+ cell_number: 14
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 74
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "Turns out, computers are pretty fast these days, even with state of
+ the art algorithms running on free tier computation resources. \U0001F60A
+ \n\nYou can pop over to the [projects](/projects) tab for a visualization of
+ the performance of all these algorithms on this dataset, or you can check out
+ the artifacts directly in the database."
+ rendering: "Turns out, computers are pretty
+ fast these days, even with state of the art algorithms running on free tier
+ computation resources. \U0001F60A
\nYou can pop over to the projects
+ tab for a visualization of the performance of all these algorithms on this dataset,
+ or you can check out the artifacts directly in the database.
"
+ execution_time: null
+ cell_number: 15
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 75
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT \n projects.name,\n models.algorithm_name,\n round((models.metrics->>'f1')::numeric,
+ 4) AS f1_score,\n round((models.metrics->>'precision')::numeric, 4) AS precision,\n
+ \ round((models.metrics->>'recall')::numeric, 4) AS recall\nFROM pgml.models\nJOIN
+ pgml.projects on projects.id = models.project_id\n AND projects.name = 'Breast
+ Cancer Detection'\nORDER BY models.metrics->>'f1' DESC LIMIT 5;"
+ rendering: "\n
\n \n \n \n
+ \ | name | \n \n algorithm_name | \n
+ \ \n f1_score | \n \n precision | \n
+ \ \n recall | \n \n
\n \n
+ \ \n \n \n \n | Breast Cancer Detection | \n
+ \ \n xgboost | \n \n 0.9860 | \n \n 0.9863 | \n
+ \ \n 0.9860 | \n \n
\n \n \n \n | Breast
+ Cancer Detection | \n \n random_forest | \n \n 0.9860 | \n
+ \ \n 0.9863 | \n \n 0.9860 | \n \n
\n
+ \ \n \n \n | Breast Cancer Detection | \n \n xgboost_random_forest | \n
+ \ \n 0.9790 | \n \n 0.9791 | \n \n 0.9790 | \n
+ \ \n
\n \n \n \n | Breast Cancer Detection | \n
+ \ \n gradient_boosting_trees | \n \n 0.9790 | \n
+ \ \n 0.9791 | \n \n 0.9790 | \n \n
\n
+ \ \n \n \n | Breast Cancer Detection | \n \n ridge | \n
+ \ \n 0.9789 | \n \n 0.9797 | \n \n 0.9790 | \n
+ \ \n
\n \n \n
\n
\n"
+ execution_time: '00:00:00.002094'
+ cell_number: 16
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 76
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: Tree based algorithms like `random_forest`, `xgboost` and `lightgbm`
+ do well on tabular datasets and frequently lead the pack with A+ level performance
+ as measured by the `f1_score`. They are generally sensitive to small changes
+ in the inputs, but also robust to outliers. They are also relatively fast algorithms
+ that can perform predictions in sub millisecond times, meaning most of the cost
+ of inference is in fetching the data they require as inputs. When your inputs
+ are already in the database with the model, that time is as fast as possible!
+ rendering: Tree based algorithms like random_forest,
+ xgboost and lightgbm do well on tabular datasets and
+ frequently lead the pack with A+ level performance as measured by the f1_score.
+ They are generally sensitive to small changes in the inputs, but also robust
+ to outliers. They are also relatively fast algorithms that can perform predictions
+ in sub millisecond times, meaning most of the cost of inference is in fetching
+ the data they require as inputs. When your inputs are already in the database
+ with the model, that time is as fast as possible!
+ execution_time: null
+ cell_number: 17
+ version: 1
+ deleted_at: null