From 32d46f9f8730d07538bb02a01062157c869d4119 Mon Sep 17 00:00:00 2001
From: Montana Low
Date: Fri, 19 Aug 2022 18:32:37 -0700
Subject: [PATCH] add binary classification notebook
---
.../fixtures/{fraud.yml => notebooks.yml} | 1045 +++++++++++++----
1 file changed, 793 insertions(+), 252 deletions(-)
rename pgml-dashboard/notebooks/fixtures/{fraud.yml => notebooks.yml} (51%)
diff --git a/pgml-dashboard/notebooks/fixtures/fraud.yml b/pgml-dashboard/notebooks/fixtures/notebooks.yml
similarity index 51%
rename from pgml-dashboard/notebooks/fixtures/fraud.yml
rename to pgml-dashboard/notebooks/fixtures/notebooks.yml
index a1b4c25fc..eb748ba40 100644
--- a/pgml-dashboard/notebooks/fixtures/fraud.yml
+++ b/pgml-dashboard/notebooks/fixtures/notebooks.yml
@@ -4,173 +4,14 @@
name: 'Tutorial 1: Real Time Fraud Detection'
created_at: 2022-08-15 22:26:18.428227+00:00
updated_at: 2022-08-15 22:26:18.428241+00:00
-- model: notebooks.notebookcell
- pk: 12
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 4
- version: 1
- deleted_at: 2022-08-16 21:32:21.265114+00:00
-- model: notebooks.notebookcell
- pk: 13
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 5
- version: 1
- deleted_at: 2022-08-16 21:32:23.210172+00:00
-- model: notebooks.notebookcell
- pk: 14
- fields:
- notebook: 1
- cell_type: 3
- contents: asfd
- rendering: null
- execution_time: null
- cell_number: 6
- version: 1
- deleted_at: 2022-08-16 21:32:25.059892+00:00
-- model: notebooks.notebookcell
- pk: 16
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 8
- version: 1
- deleted_at: 2022-08-16 21:32:27.489903+00:00
-- model: notebooks.notebookcell
- pk: 18
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 9
- version: 2
- deleted_at: 2022-08-16 21:32:26.408082+00:00
-- model: notebooks.notebookcell
- pk: 19
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 10
- version: 1
- deleted_at: 2022-08-16 21:32:29.396631+00:00
-- model: notebooks.notebookcell
- pk: 23
- fields:
- notebook: 1
- cell_type: 1
- contents: test
- rendering: test
- execution_time: null
- cell_number: 12
- version: 1
- deleted_at: 2022-08-16 21:32:39.748416+00:00
-- model: notebooks.notebookcell
- pk: 24
- fields:
- notebook: 1
- cell_type: 1
- contents: ''
- rendering:
- execution_time: null
- cell_number: 13
- version: 1
- deleted_at: 2022-08-16 21:32:42.950765+00:00
-- model: notebooks.notebookcell
- pk: 25
- fields:
- notebook: 1
- cell_type: 3
- contents: ''
- rendering: null
- execution_time: null
- cell_number: 14
- version: 1
- deleted_at: 2022-08-16 21:32:43.832970+00:00
-- model: notebooks.notebookcell
- pk: 26
- fields:
- notebook: 1
- cell_type: 1
- contents: asdf
- rendering: asdf
- execution_time: null
- cell_number: 7
- version: 2
- deleted_at: 2022-08-16 21:32:25.873904+00:00
-- model: notebooks.notebookcell
- pk: 29
- fields:
- notebook: 1
- cell_type: 3
- contents: select 1
- rendering: null
- execution_time: null
- cell_number: 11
- version: 4
- deleted_at: 2022-08-16 21:32:39.009549+00:00
-- model: notebooks.notebookcell
- pk: 31
- fields:
- notebook: 1
- cell_type: 3
- contents: "INSERT INTO PRODUCTS (emoji, name, price, perishable) \nVALUES\n ('\U0001F4B0',
- '1oz gold bar', '$1999.99', false),\n ('\U0001F4D5', 'a tale of 2 cities',
- '$19.99', false),\n ('\U0001F96C', 'head of lettuce', '$1.99', true)\nRETURNING
- *;"
- rendering: null
- execution_time: null
- cell_number: 3
- version: 5
- deleted_at: null
-- model: notebooks.notebookcell
- pk: 33
- fields:
- notebook: 1
- cell_type: 1
- contents: "Now that we're in business, our first customer has shown up, named
- Alice. She is a chef that owns a salad shop, so she is going to create an order
- for 1,000 \U0001F96C `head of lettuce`.\n\nOur ecommerce site will record `orders`
- and their `line_items` in our database with the following schema."
- rendering: "Now that we're in business, our
- first customer has shown up, named Alice. She is a chef that owns a salad shop,
- so she is going to create an order for 1,000 \U0001F96C head of lettuce.
\nOur
- ecommerce site will record orders and their line_items
- in our database with the following schema.
"
- execution_time: null
- cell_number: 4
- version: 2
- deleted_at: null
-- model: notebooks.notebookcell
- pk: 35
+- model: notebooks.notebook
+ pk: 2
fields:
- notebook: 1
- cell_type: 3
- contents: "CREATE TABLE products (\n emoji TEXT PRIMARY KEY,\n name TEXT,\n
- \ price MONEY,\n perishable BOOLEAN\n);"
- rendering: null
- execution_time: null
- cell_number: 2
- version: 9
- deleted_at: null
+ name: 'Tutorial 2: Tumor Detection w/ Binary Classification'
+ created_at: 2022-08-19 23:10:23.120983+00:00
+ updated_at: 2022-08-19 23:10:23.120996+00:00
- model: notebooks.notebookcell
- pk: 36
+ pk: 1
fields:
notebook: 1
cell_type: 1
@@ -217,6 +58,8 @@
- Part 4: Adding More Features
+ - Part 5: Upgrading the Machine Learning Algorithm
+
Part 1: Ecommerce Application Data Model
@@ -263,6 +106,8 @@
Part 4: Adding More Features
+ Part 5: Upgrading the Machine Learning Algorithm
+
Part 1: Ecommerce Application Data Model
@@ -272,21 +117,54 @@
price, and other metadata, like whether or not they are perishable goods.
'
execution_time: null
cell_number: 1
- version: 4
+ version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 37
+ pk: 2
+ fields:
+ notebook: 1
+ cell_type: 3
+ contents: "CREATE TABLE products (\n emoji TEXT PRIMARY KEY,\n name TEXT,\n
+ \ price MONEY,\n perishable BOOLEAN\n);"
+ rendering: null
+ execution_time: null
+ cell_number: 2
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 3
+ fields:
+ notebook: 1
+ cell_type: 3
+ contents: "INSERT INTO PRODUCTS (emoji, name, price, perishable) \nVALUES\n ('\U0001F4B0',
+ '1oz gold bar', '$1999.99', false),\n ('\U0001F4D5', 'a tale of 2 cities',
+ '$19.99', false),\n ('\U0001F96C', 'head of lettuce', '$1.99', true)\nRETURNING
+ *;"
+ rendering: null
+ execution_time: null
+ cell_number: 3
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 4
fields:
notebook: 1
cell_type: 1
- contents: ters
- rendering: ters
+ contents: "Now that we're in business, our first customer has shown up, named
+ Alice. She is a chef that owns a salad shop, so she is going to create an order
+ for 1,000 \U0001F96C `head of lettuce`.\n\nOur ecommerce site will record `orders`
+ and their `line_items` in our database with the following schema."
+ rendering: "Now that we're in business, our
+ first customer has shown up, named Alice. She is a chef that owns a salad shop,
+ so she is going to create an order for 1,000 \U0001F96C head of lettuce.
\nOur
+ ecommerce site will record orders and their line_items
+ in our database with the following schema.
"
execution_time: null
- cell_number: 5
+ cell_number: 4
version: 1
- deleted_at: 2022-08-18 15:01:02.379644+00:00
+ deleted_at: null
- model: notebooks.notebookcell
- pk: 38
+ pk: 5
fields:
notebook: 1
cell_type: 3
@@ -299,7 +177,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 39
+ pk: 6
fields:
notebook: 1
cell_type: 1
@@ -311,7 +189,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 40
+ pk: 7
fields:
notebook: 1
cell_type: 3
@@ -325,7 +203,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 41
+ pk: 8
fields:
notebook: 1
cell_type: 1
@@ -347,7 +225,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 42
+ pk: 9
fields:
notebook: 1
cell_type: 3
@@ -359,7 +237,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 43
+ pk: 10
fields:
notebook: 1
cell_type: 1
@@ -372,7 +250,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 44
+ pk: 11
fields:
notebook: 1
cell_type: 3
@@ -388,7 +266,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 45
+ pk: 12
fields:
notebook: 1
cell_type: 1
@@ -404,7 +282,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 46
+ pk: 13
fields:
notebook: 1
cell_type: 3
@@ -417,7 +295,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 47
+ pk: 14
fields:
notebook: 1
cell_type: 1
@@ -430,7 +308,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 48
+ pk: 15
fields:
notebook: 1
cell_type: 3
@@ -446,7 +324,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 49
+ pk: 16
fields:
notebook: 1
cell_type: 1
@@ -461,7 +339,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 50
+ pk: 17
fields:
notebook: 1
cell_type: 3
@@ -473,7 +351,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 51
+ pk: 18
fields:
notebook: 1
cell_type: 1
@@ -485,7 +363,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 52
+ pk: 19
fields:
notebook: 1
cell_type: 3
@@ -498,7 +376,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 53
+ pk: 20
fields:
notebook: 1
cell_type: 1
@@ -523,7 +401,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 54
+ pk: 21
fields:
notebook: 1
cell_type: 3
@@ -540,7 +418,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 55
+ pk: 22
fields:
notebook: 1
cell_type: 1
@@ -551,7 +429,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 56
+ pk: 23
fields:
notebook: 1
cell_type: 3
@@ -562,7 +440,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 57
+ pk: 24
fields:
notebook: 1
cell_type: 1
@@ -583,7 +461,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 58
+ pk: 25
fields:
notebook: 1
cell_type: 3
@@ -600,7 +478,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 59
+ pk: 26
fields:
notebook: 1
cell_type: 3
@@ -611,7 +489,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 60
+ pk: 27
fields:
notebook: 1
cell_type: 1
@@ -642,7 +520,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 61
+ pk: 28
fields:
notebook: 1
cell_type: 3
@@ -658,7 +536,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 62
+ pk: 29
fields:
notebook: 1
cell_type: 1
@@ -705,7 +583,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 63
+ pk: 30
fields:
notebook: 1
cell_type: 3
@@ -719,7 +597,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 64
+ pk: 31
fields:
notebook: 1
cell_type: 1
@@ -731,7 +609,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 65
+ pk: 32
fields:
notebook: 1
cell_type: 3
@@ -747,7 +625,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 66
+ pk: 33
fields:
notebook: 1
cell_type: 1
@@ -759,7 +637,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 67
+ pk: 34
fields:
notebook: 1
cell_type: 3
@@ -772,7 +650,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 68
+ pk: 35
fields:
notebook: 1
cell_type: 1
@@ -783,7 +661,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 69
+ pk: 36
fields:
notebook: 1
cell_type: 3
@@ -799,7 +677,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 70
+ pk: 37
fields:
notebook: 1
cell_type: 1
@@ -812,7 +690,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 71
+ pk: 38
fields:
notebook: 1
cell_type: 3
@@ -825,7 +703,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 72
+ pk: 39
fields:
notebook: 1
cell_type: 1
@@ -837,7 +715,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 73
+ pk: 40
fields:
notebook: 1
cell_type: 3
@@ -853,7 +731,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 74
+ pk: 41
fields:
notebook: 1
cell_type: 1
@@ -866,7 +744,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 75
+ pk: 42
fields:
notebook: 1
cell_type: 3
@@ -879,7 +757,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 76
+ pk: 43
fields:
notebook: 1
cell_type: 1
@@ -896,7 +774,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 77
+ pk: 44
fields:
notebook: 1
cell_type: 3
@@ -910,7 +788,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 78
+ pk: 45
fields:
notebook: 1
cell_type: 1
@@ -940,7 +818,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 79
+ pk: 46
fields:
notebook: 1
cell_type: 3
@@ -959,7 +837,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 80
+ pk: 47
fields:
notebook: 1
cell_type: 1
@@ -972,7 +850,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 81
+ pk: 48
fields:
notebook: 1
cell_type: 3
@@ -980,15 +858,15 @@
-- a friendly name we'll use to identify this machine learning project\n task
=> 'classification', -- we want to classify into true or false\n relation_name
=> 'fraud_samples', -- our view of the data\n y_column_name => 'fraudulent',
- -- the \"labels\"\n test_size => 0.5 -- use half the data for testing rather
- than the default test size of 25%\n);"
+ -- the \"labels\"\n test_sampling => 'last',\n test_size => 0.5 -- use half
+ the data for testing rather than the default test size of 25%\n);"
rendering: null
execution_time: null
cell_number: 48
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 82
+ pk: 49
fields:
notebook: 1
cell_type: 1
@@ -1000,7 +878,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 83
+ pk: 50
fields:
notebook: 1
cell_type: 3
@@ -1011,7 +889,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 84
+ pk: 51
fields:
notebook: 1
cell_type: 1
@@ -1023,7 +901,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 85
+ pk: 52
fields:
notebook: 1
cell_type: 3
@@ -1036,7 +914,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 86
+ pk: 53
fields:
notebook: 1
cell_type: 1
@@ -1095,29 +973,36 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 87
+ pk: 54
fields:
notebook: 1
cell_type: 1
- contents: When you're out of ideas for features that might help the model distinguish
+ contents: 'Part 5: Upgrading the Machine Learning Algorithm
+
+ ------------------------------------------
+
+
+ When you''re out of ideas for features that might help the model distinguish
orders that are likely to result in chargebacks, you may want to start testing
different algorithms to see how the performance changes. PostgresML makes algorithm
selection as easy as passing an additional parameter to `pgml.train`. You may
want to test them all just to see, but `xgboost` typically gives excellent performance
- in terms of both accuracy and latency.
- rendering: When you're out of ideas for features
- that might help the model distinguish orders that are likely to result in chargebacks,
- you may want to start testing different algorithms to see how the performance
- changes. PostgresML makes algorithm selection as easy as passing an additional
- parameter to pgml.train. You may want to test them all just to
- see, but xgboost typically gives excellent performance in terms
- of both accuracy and latency.
+ in terms of both accuracy and latency.'
+ rendering: 'Part 5: Upgrading the Machine Learning
+ Algorithm
+
+ When you''re out of ideas for features that might help the model distinguish
+ orders that are likely to result in chargebacks, you may want to start testing
+ different algorithms to see how the performance changes. PostgresML makes algorithm
+ selection as easy as passing an additional parameter to pgml.train.
+ You may want to test them all just to see, but xgboost typically
+ gives excellent performance in terms of both accuracy and latency.
'
execution_time: null
cell_number: 54
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 88
+ pk: 55
fields:
notebook: 1
cell_type: 3
@@ -1134,33 +1019,26 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 89
+ pk: 56
fields:
notebook: 1
cell_type: 1
- contents: 'Using Regression instead of Classificaiton
-
- ------------------------------------------
-
-
- So far we''ve been training a classifier that gives us a binary 0 or 1 output
- to indicate likely fraud or not. If we''d like to refine our application response
+ contents: So far we've been training a classifier that gives us a binary 0 or
+ 1 output to indicate fraud or not. If we'd like to refine our application response
to the models predictions in a more nuanced way, say high/medium/low risk instead
of binary, we can use "regression" instead of "classification" to predict a
- likelihood between 0 and 1, instead of binary.'
- rendering: 'Using Regression instead of Classificaiton
-
- So far we''ve been training a classifier that gives us a binary 0 or 1 output
- to indicate likely fraud or not. If we''d like to refine our application response
- to the models predictions in a more nuanced way, say high/medium/low risk instead
- of binary, we can use "regression" instead of "classification" to predict a
- likelihood between 0 and 1, instead of binary.
'
+ likelihood between 0 and 1, instead of binary.
+ rendering: So far we've been training a classifier
+ that gives us a binary 0 or 1 output to indicate fraud or not. If we'd like
+ to refine our application response to the models predictions in a more nuanced
+ way, say high/medium/low risk instead of binary, we can use "regression" instead
+ of "classification" to predict a likelihood between 0 and 1, instead of binary.
execution_time: null
cell_number: 56
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 90
+ pk: 57
fields:
notebook: 1
cell_type: 3
@@ -1176,7 +1054,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 91
+ pk: 58
fields:
notebook: 1
cell_type: 1
@@ -1195,7 +1073,7 @@
version: 1
deleted_at: null
- model: notebooks.notebookcell
- pk: 92
+ pk: 59
fields:
notebook: 1
cell_type: 3
@@ -1209,3 +1087,666 @@
cell_number: 59
version: 1
deleted_at: null
+- model: notebooks.notebookcell
+ pk: 60
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Binary classification means categorizing data into 2 categories. Usually
+ these are categories like:
+
+
+ - `True` or `False`
+
+ - `0` or `1`
+
+ - `hot_dog` or `not_hot_dog`
+
+
+ These categories divide a population into things we care about, and things we
+ can ignore. Binary classification is a common task for machine learning models.
+ It can be applied across a broad set of scenarios, once you understand the way
+ to structure your problem as a set of example data with labeled outcomes.
+
+
+ In this tutorial, we''ll train models using various "supervised learning" algorithms
+ to classify medical samples as benign or malignant. Supervised learning techniques
+ require us to label the sample data for the algorithm to learn how the inputs
+ correlate with the labels. After the algorithm has been trained on the labeled
+ data set we created, we can present it with new unlabeled data to classify based
+ on the most likely outcome.
+
+
+ As we saw in [Tutorial 1: Real Time Fraud Detection](/notebooks/notebook/1) understanding
+ the structure of the data and the labels is a complex and critical step for
+ real world machine learning projects. In this example we''ll focus more on the
+ different algorithms, and use an academic benchmark dataset that already includes
+ binary labels from UCI ML Breast Cancer Wisconsin. Features were computed from
+ a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe
+ characteristics of the cell nuclei present in the image. The labels are either
+ True for a malignant sample or False for a benign sample.
+
+
+ You can load this dataset into your Postgres database with the following SQL.'
+ rendering: 'Binary classification means categorizing
+ data into 2 categories. Usually these are categories like:
+
+
+
+ True or False
+
+ 0 or 1
+
+ hot_dog or not_hot_dog
+
+
+
+ These categories divide a population into things we care about, and things
+ we can ignore. Binary classification is a common task for machine learning models.
+ It can be applied across a broad set of scenarios, once you understand the way
+ to structure your problem as a set of example data with labeled outcomes.
+
+ In this tutorial, we''ll train models using various "supervised learning"
+ algorithms to classify medical samples as benign or malignant. Supervised learning
+ techniques require us to label the sample data for the algorithm to learn how
+ the inputs correlate with the labels. After the algorithm has been trained on
+ the labeled data set we created, we can present it with new unlabeled data to
+ classify based on the most likely outcome.
+
+ As we saw in Tutorial 1: Real Time Fraud
+ Model understanding the structure of the data and the labels is a complex
+ and critical step for real world machine learning projects. In this example
+ we''ll focus more on the different algorithms, and use an academic benchmark
+ dataset that already includes binary labels from UCI ML Breast Cancer Wisconsin.
+ Features were computed from a digitized image of a fine needle aspirate (FNA)
+ of a breast mass. They describe characteristics of the cell nuclei present in
+ the image. The labels are either True for a malginant sample of False for a
+ benign sample.
+
+ You can load this dataset into your Postgres database with the following
+ SQL.
'
+ execution_time: null
+ cell_number: 1
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 61
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: SELECT pgml.load_dataset('breast_cancer');
+ rendering: "\n
\n \n \n \n
+ \ | load_dataset | \n \n
\n \n
+ \ \n \n \n \n | OK | \n \n
\n \n
+ \ \n
\n
\n"
+ execution_time: '00:00:02.498819'
+ cell_number: 2
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 62
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: This function has created a new table in your database named `pgml.breast_cancer`.
+ Let's look at a random sample of the data with some more SQL.
+ rendering: This function has created a new table
+ in your database named pgml.breast_cancer. Let's look at a random
+ sample of the data with some more SQL.
+ execution_time: null
+ cell_number: 3
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 63
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT * \nFROM pgml.breast_cancer \nORDER BY random()\nLIMIT 10;"
+ rendering: "\n
\n \n \n \n
+ \ | mean radius | \n \n mean
+ texture | \n \n mean perimeter | \n
+ \ \n mean area | \n \n mean
+ smoothness | \n \n mean compactness | \n
+ \ \n mean concavity | \n \n mean
+ concave points | \n \n mean symmetry | \n
+ \ \n mean fractal dimension | \n \n radius
+ error | \n \n texture error | \n
+ \ \n perimeter error | \n \n area
+ error | \n \n smoothness error | \n
+ \ \n compactness error | \n \n concavity
+ error | \n \n concave points error | \n
+ \ \n symmetry error | \n \n fractal
+ dimension error | \n \n worst radius | \n
+ \ \n worst texture | \n \n worst
+ perimeter | \n \n worst area | \n
+ \ \n worst smoothness | \n \n worst
+ compactness | \n \n worst concavity | \n
+ \ \n worst concave points | \n \n worst
+ symmetry | \n \n worst fractal dimension | \n
+ \ \n malignant | \n \n
\n \n
+ \ \n \n \n \n | 12.77 | \n \n 21.41 | \n
+ \ \n 82.02 | \n \n 507.4 | \n \n 0.08749 | \n
+ \ \n 0.06601 | \n \n 0.03112 | \n \n 0.02864 | \n
+ \ \n 0.1694 | \n \n 0.06287 | \n \n 0.7311 | \n
+ \ \n 1.748 | \n \n 5.118 | \n \n 53.65 | \n
+ \ \n 0.004571 | \n \n 0.0179 | \n \n 0.02176 | \n
+ \ \n 0.01757 | \n \n 0.03373 | \n \n 0.005875 | \n
+ \ \n 13.75 | \n \n 23.5 | \n \n 89.04 | \n
+ \ \n 579.5 | \n \n 0.09388 | \n \n 0.08978 | \n
+ \ \n 0.05186 | \n \n 0.04773 | \n \n 0.2179 | \n
+ \ \n 0.06871 | \n \n False | \n \n
\n
+ \ \n \n \n | 12.22 | \n \n 20.04 | \n
+ \ \n 79.47 | \n \n 453.1 | \n \n 0.1096 | \n
+ \ \n 0.1152 | \n \n 0.08175 | \n \n 0.02166 | \n
+ \ \n 0.2124 | \n \n 0.06894 | \n \n 0.1811 | \n
+ \ \n 0.7959 | \n \n 0.9857 | \n \n 12.58 | \n
+ \ \n 0.006272 | \n \n 0.02198 | \n \n 0.03966 | \n
+ \ \n 0.009894 | \n \n 0.0132 | \n \n 0.003813 | \n
+ \ \n 13.16 | \n \n 24.17 | \n \n 85.13 | \n
+ \ \n 515.3 | \n \n 0.1402 | \n \n 0.2315 | \n
+ \ \n 0.3535 | \n \n 0.08088 | \n \n 0.2709 | \n
+ \ \n 0.08839 | \n \n False | \n \n
\n
+ \ \n \n \n | 12.4 | \n \n 17.68 | \n
+ \ \n 81.47 | \n \n 467.8 | \n \n 0.1054 | \n
+ \ \n 0.1316 | \n \n 0.07741 | \n \n 0.02799 | \n
+ \ \n 0.1811 | \n \n 0.07102 | \n \n 0.1767 | \n
+ \ \n 1.46 | \n \n 2.204 | \n \n 15.43 | \n
+ \ \n 0.01 | \n \n 0.03295 | \n \n 0.04861 | \n
+ \ \n 0.01167 | \n \n 0.02187 | \n \n 0.006005 | \n
+ \ \n 12.88 | \n \n 22.91 | \n \n 89.61 | \n
+ \ \n 515.8 | \n \n 0.145 | \n \n 0.2629 | \n
+ \ \n 0.2403 | \n \n 0.0737 | \n \n 0.2556 | \n
+ \ \n 0.09359 | \n \n False | \n \n
\n
+ \ \n \n \n | 14.02 | \n \n 15.66 | \n
+ \ \n 89.59 | \n \n 606.5 | \n \n 0.07966 | \n
+ \ \n 0.05581 | \n \n 0.02087 | \n \n 0.02652 | \n
+ \ \n 0.1589 | \n \n 0.05586 | \n \n 0.2142 | \n
+ \ \n 0.6549 | \n \n 1.606 | \n \n 19.25 | \n
+ \ \n 0.004837 | \n \n 0.009238 | \n \n 0.009213 | \n
+ \ \n 0.01076 | \n \n 0.01171 | \n \n 0.002104 | \n
+ \ \n 14.91 | \n \n 19.31 | \n \n 96.53 | \n
+ \ \n 688.9 | \n \n 0.1034 | \n \n 0.1017 | \n
+ \ \n 0.0626 | \n \n 0.08216 | \n \n 0.2136 | \n
+ \ \n 0.0671 | \n \n False | \n \n
\n
+ \ \n \n \n | 19.59 | \n \n 18.15 | \n
+ \ \n 130.7 | \n \n 1214.0 | \n \n 0.112 | \n
+ \ \n 0.1666 | \n \n 0.2508 | \n \n 0.1286 | \n
+ \ \n 0.2027 | \n \n 0.06082 | \n \n 0.7364 | \n
+ \ \n 1.048 | \n \n 4.792 | \n \n 97.07 | \n
+ \ \n 0.004057 | \n \n 0.02277 | \n \n 0.04029 | \n
+ \ \n 0.01303 | \n \n 0.01686 | \n \n 0.003318 | \n
+ \ \n 26.73 | \n \n 26.39 | \n \n 174.9 | \n
+ \ \n 2232.0 | \n \n 0.1438 | \n \n 0.3846 | \n
+ \ \n 0.681 | \n \n 0.2247 | \n \n 0.3643 | \n
+ \ \n 0.09223 | \n \n True | \n \n
\n
+ \ \n \n \n | 8.726 | \n \n 15.83 | \n
+ \ \n 55.84 | \n \n 230.9 | \n \n 0.115 | \n
+ \ \n 0.08201 | \n \n 0.04132 | \n \n 0.01924 | \n
+ \ \n 0.1649 | \n \n 0.07633 | \n \n 0.1665 | \n
+ \ \n 0.5864 | \n \n 1.354 | \n \n 8.966 | \n
+ \ \n 0.008261 | \n \n 0.02213 | \n \n 0.03259 | \n
+ \ \n 0.0104 | \n \n 0.01708 | \n \n 0.003806 | \n
+ \ \n 9.628 | \n \n 19.62 | \n \n 64.48 | \n
+ \ \n 284.4 | \n \n 0.1724 | \n \n 0.2364 | \n
+ \ \n 0.2456 | \n \n 0.105 | \n \n 0.2926 | \n
+ \ \n 0.1017 | \n \n False | \n \n
\n
+ \ \n \n \n | 17.99 | \n \n 10.38 | \n
+ \ \n 122.8 | \n \n 1001.0 | \n \n 0.1184 | \n
+ \ \n 0.2776 | \n \n 0.3001 | \n \n 0.1471 | \n
+ \ \n 0.2419 | \n \n 0.07871 | \n \n 1.095 | \n
+ \ \n 0.9053 | \n \n 8.589 | \n \n 153.4 | \n
+ \ \n 0.006399 | \n \n 0.04904 | \n \n 0.05373 | \n
+ \ \n 0.01587 | \n \n 0.03003 | \n \n 0.006193 | \n
+ \ \n 25.38 | \n \n 17.33 | \n \n 184.6 | \n
+ \ \n 2019.0 | \n \n 0.1622 | \n \n 0.6656 | \n
+ \ \n 0.7119 | \n \n 0.2654 | \n \n 0.4601 | \n
+ \ \n 0.1189 | \n \n True | \n \n
\n
+ \ \n \n \n | 11.74 | \n \n 14.69 | \n
+ \ \n 76.31 | \n \n 426.0 | \n \n 0.08099 | \n
+ \ \n 0.09661 | \n \n 0.06726 | \n \n 0.02639 | \n
+ \ \n 0.1499 | \n \n 0.06758 | \n \n 0.1924 | \n
+ \ \n 0.6417 | \n \n 1.345 | \n \n 13.04 | \n
+ \ \n 0.006982 | \n \n 0.03916 | \n \n 0.04017 | \n
+ \ \n 0.01528 | \n \n 0.0226 | \n \n 0.006822 | \n
+ \ \n 12.45 | \n \n 17.6 | \n \n 81.25 | \n
+ \ \n 473.8 | \n \n 0.1073 | \n \n 0.2793 | \n
+ \ \n 0.269 | \n \n 0.1056 | \n \n 0.2604 | \n
+ \ \n 0.09879 | \n \n False | \n \n
\n
+ \ \n \n \n | 9.667 | \n \n 18.49 | \n
+ \ \n 61.49 | \n \n 289.1 | \n \n 0.08946 | \n
+ \ \n 0.06258 | \n \n 0.02948 | \n \n 0.01514 | \n
+ \ \n 0.2238 | \n \n 0.06413 | \n \n 0.3776 | \n
+ \ \n 1.35 | \n \n 2.569 | \n \n 22.73 | \n
+ \ \n 0.007501 | \n \n 0.01989 | \n \n 0.02714 | \n
+ \ \n 0.009883 | \n \n 0.0196 | \n \n 0.003913 | \n
+ \ \n 11.14 | \n \n 25.62 | \n \n 70.88 | \n
+ \ \n 385.2 | \n \n 0.1234 | \n \n 0.1542 | \n
+ \ \n 0.1277 | \n \n 0.0656 | \n \n 0.3174 | \n
+ \ \n 0.08524 | \n \n False | \n \n
\n
+ \ \n \n \n | 13.08 | \n \n 15.71 | \n
+ \ \n 85.63 | \n \n 520.0 | \n \n 0.1075 | \n
+ \ \n 0.127 | \n \n 0.04568 | \n \n 0.0311 | \n
+ \ \n 0.1967 | \n \n 0.06811 | \n \n 0.1852 | \n
+ \ \n 0.7477 | \n \n 1.383 | \n \n 14.67 | \n
+ \ \n 0.004097 | \n \n 0.01898 | \n \n 0.01698 | \n
+ \ \n 0.00649 | \n \n 0.01678 | \n \n 0.002425 | \n
+ \ \n 14.5 | \n \n 20.49 | \n \n 96.09 | \n
+ \ \n 630.5 | \n \n 0.1312 | \n \n 0.2776 | \n
+ \ \n 0.189 | \n \n 0.07283 | \n \n 0.3184 | \n
+ \ \n 0.08183 | \n \n False | \n \n
\n
+ \ \n \n
\n
\n"
+ execution_time: '00:00:00.007697'
+ cell_number: 4
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 64
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'That''s a lot of numeric feature data describing various attributes
+ of the cells, but if you scroll all the way to the right above, after running
+ the query, you''ll see that each sample set of feature data is labeled `malignant`
+ [`True` or `False`]. It would be extremely difficult for a human to study all
+ these numbers, and see how they correlate with malignant or not, and then be
+ able to make a prediction for new samples, but mathematicians have been working
+ on algorithms to do exactly this using computers which happen to be exceptionally
+ good at this by now. This is statistical machine learning.
+
+
+ PostgresML makes it easy to use this data to create a model. It only takes a
+ single function call with a few parameters.'
+ rendering: 'That''s a lot of numeric feature
+ data describing various attributes of the cells, but if you scroll all the way
+ to the right above, after running the query, you''ll see that each sample set
+ of feature data is labeled malignant [True or False].
+ It would be extremely difficult for a human to study all these numbers, and
+ see how they correlate with malignant or not, and then be able to make a prediction
+ for new samples, but mathemeticians have been working on algorithms to do exactly
+ this using computers which happen to be exceptionally good at this by now. This
+ is statistical machine learning.
+
+ PostgresML makes it easy to use this data to create a model. It only takes
+ a single function call with a few parameters.
'
+ execution_time: null
+ cell_number: 5
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 65
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT * FROM pgml.train(\n project_name => 'Breast Cancer Detection',
+ \n task => 'classification', \n relation_name => 'pgml.breast_cancer', \n
+ \ y_column_name => 'malignant'\n);"
+ rendering: "\n
\n \n \n \n
+ \ | project_name | \n \n task | \n
+ \ \n algorithm_name | \n \n status | \n
+ \ \n
\n \n \n \n \n \n | Breast
+ Cancer Detection | \n \n classification | \n \n linear | \n
+ \ \n not deployed | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.802388'
+ cell_number: 6
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 66
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "\U0001F3C1 Congratulations \U0001F3C1\n---------------------\n\nYou've
+ just created a machine learning model, tested its accuracy, and deployed it
+ to production. PostgresML orchestrated a bunch of the traditional ML drudgery
+ in that couple of seconds to make it as simple as possible for you to get value.
+ We'll organize our work on this task under the project name \"Breast Cancer
+ Detection\", which you can now see it in your [list of projects](/projects/).
+ You can see that the first model uses the default linear algorithm, and that
+ it achieves an [F1 score](https://en.wikipedia.org/wiki/F-score) in the mid
+ 90's, which is pretty good. A score of 1.0 is perfect, and 0.5 would be as good
+ as random guessing. The better the F1 score, the better the algorithm can perform
+ on this dataset. \n\nWe can now use this model to make some predictions in real
+ time, using the training data as input to the `pgml.predict` function."
+ rendering: "\U0001F3C1 Congratulations \U0001F3C1
\nYou've
+ just created a machine learning model, tested its accuracy, and deployed it
+ to production. PostgresML orchestrated a bunch of the traditional ML drudgery
+ in that couple of seconds to make it as simple as possible for you to get value.
+ We'll organize our work on this task under the project name \"Breast Cancer
+ Detection\", which you can now see in your list of
+ projects. You can see that the first model uses the default linear algorithm,
+ and that it achieves an F1
+ score in the mid 90's, which is pretty good. A score of 1.0 is perfect,
+ and 0.5 would be as good as random guessing. The better the F1 score, the better
+ the algorithm can perform on this dataset.
\nWe can now use this model
+ to make some predictions in real time, using the training data as input to the
+ pgml.predict function.
"
+ execution_time: null
+ cell_number: 7
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 67
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT malignant, pgml.predict(\n 'Breast Cancer Detection', \n
+ \ ARRAY[\n \"mean radius\", \n \"mean texture\", \n \"mean
+ perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n
+ \ \"mean symmetry\",\n \"mean fractal dimension\",\n \"radius
+ error\",\n \"texture error\",\n \"perimeter error\",\n \"area
+ error\",\n \"smoothness error\",\n \"compactness error\",\n \"concavity
+ error\",\n \"concave points error\",\n \"symmetry error\",\n \"fractal
+ dimension error\",\n \"worst radius\",\n \"worst texture\",\n
+ \ \"worst perimeter\",\n \"worst area\",\n \"worst smoothness\",\n
+ \ \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ ]\n) AS prediction\nFROM pgml.breast_cancer\nORDER BY random()\nLIMIT 10;"
+ rendering: "\n
\n \n \n \n
+ \ | malignant | \n \n prediction | \n
+ \ \n
\n \n \n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n \n | False | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | True | \n
+ \ \n 1.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.657161'
+ cell_number: 8
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 68
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "You can see the model is pretty good at predicting `0` for non malignant
+ samples, and `1` for malignant samples. This isn't a great test though, because
+ we're using the same data we trained with. We could have just looked up the
+ data in the database table if this is all we wanted to do. The point of training
+ a machine learning model is to generalize these statistics to data we've never
+ seen before. What do you think this model would predict if all the input features
+ happened to be 0 or 1? How does that compare to what it's seen before? \n\nIt's
+ easy to test the model and see by providing new sample data in real time. There
+ are lots of ways we could feed new data to a model in Postgres. We could write
+ new samples to a table just like our training data, or we could pass parameters
+ directly into a query without recording anything in the database at all. Postgres
+ gives us a lot of ways to get data in and out at run time. We'll demonstrate
+ with a `VALUES` example for batch prediction."
+ rendering: 'You can see the model is pretty
+ good at predicting 0 for non malignant samples, and 1
+ for malignant samples. This isn''t a great test though, because we''re using
+ the same data we trained with. We could have just looked up the data in the
+ database table if this is all we wanted to do. The point of training a machine
+ learning model is to generalize these statistics to data we''ve never seen
+ before. What do you think this model would predict if all the input features
+ happened to be 0 or 1? How does that compare to what it''s seen before?
+
+ It''s easy to test the model and see by providing new sample data in real
+ time. There are lots of ways we could feed new data to a model in Postgres.
+ We could write new samples to a table just like our training data, or we could
+ pass parameters directly into a query without recording anything in the database
+ at all. Postgres gives us a lot of ways to get data in and out at run time.
+ We''ll demonstrate with a VALUES example for batch prediction.
'
+ execution_time: null
+ cell_number: 9
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 69
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT sample_name, pgml.predict(\n 'Breast Cancer Detection', \n
+ \ ARRAY[\n \"mean radius\", \n \"mean texture\", \n \"mean
+ perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n
+ \ \"mean symmetry\",\n \"mean fractal dimension\",\n \"radius
+ error\",\n \"texture error\",\n \"perimeter error\",\n \"area
+ error\",\n \"smoothness error\",\n \"compactness error\",\n \"concavity
+ error\",\n \"concave points error\",\n \"symmetry error\",\n \"fractal
+ dimension error\",\n \"worst radius\",\n \"worst texture\",\n
+ \ \"worst perimeter\",\n \"worst area\",\n \"worst smoothness\",\n
+ \ \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ ]\n) AS prediction\nFROM (\n VALUES \n \t('all_zeroes',0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),\n
+ \ \t('all_ones', 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)\n)
+ \n AS t (\n \"sample_name\",\n \"mean radius\", \n \"mean texture\",
+ \n \"mean perimeter\", \n \"mean area\",\n \"mean smoothness\",\n \"mean
+ compactness\",\n \"mean concavity\",\n \"mean concave points\",\n \"mean
+ symmetry\",\n \"mean fractal dimension\",\n \"radius error\",\n \"texture
+ error\",\n \"perimeter error\",\n \"area error\",\n \"smoothness error\",\n
+ \ \"compactness error\",\n \"concavity error\",\n \"concave points error\",\n
+ \ \"symmetry error\",\n \"fractal dimension error\",\n \"worst radius\",\n
+ \ \"worst texture\",\n \"worst perimeter\",\n \"worst area\",\n \"worst
+ smoothness\",\n \"worst compactness\",\n \"worst concavity\",\n \"worst
+ concave points\",\n \"worst symmetry\",\n \"worst fractal dimension\"\n
+ \ );"
+ rendering: "\n
\n \n \n \n
+ \ | sample_name | \n \n prediction | \n
+ \ \n
\n \n \n \n \n \n | all_zeroes | \n
+ \ \n 0.0 | \n \n
\n \n \n \n | all_ones | \n
+ \ \n 0.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.626657'
+ cell_number: 10
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 70
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Even though the inputs are not data we''ve ever seen before, the model
+ is telling us both of these new samples are likely to be benign based on their
+ statistical correlations to the training samples we had labelled. As we collect
+ new data samples, we could potentially use this model for multiple purposes,
+ like screening the samples before doing further more expensive or invasive analysis.
+
+
+ To demonstrate a more concise call that omits all the feature names (careful
+ to get the order right):'
+ rendering: 'Even though the inputs are not data
+ we''ve ever seen before, the model is telling us both of these new samples are
+ likely to be benign based on their statistical correlations to the training samples
+ we had labelled. As we collect new data samples, we could potentially use this
+ model for multiple purposes, like screening the samples before doing further
+ more expensive or invasive analysis.
+
+ To demonstrate a more concise call that omits all the feature names (careful
+ to get the order right):
'
+ execution_time: null
+ cell_number: 11
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 71
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT pgml.predict(\n 'Breast Cancer Detection', \n ARRAY[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100000]\n)"
+ rendering: "\n
\n \n \n \n
+ \ | predict | \n \n
\n \n \n
+ \ \n \n \n | 1.0 | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:02.643660'
+ cell_number: 12
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 72
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: 'Ah hah! We put a really big number into the last feature (worst fractal
+ dimension), and got the model to give us a `True` prediction, indicating that
+ large values there correlate with a malignant sample all else being equal using
+ our default linear algorithm. There are lots of ways we can probe the model
+ with test data, but before we spend too much time on this one, it might be informative
+ to try other algorithms.
+
+
+ PostgresML makes it easy to reuse your training data with many of the best algorithms
+ available. Why not try them all?'
+ rendering: 'Ah hah! We put a really big number
+ into the last feature (worst fractal dimension), and got the model to give us
+ a True prediction, indicating that large values there correlate
+ with a malignant sample all else being equal using our default linear algorithm.
+ There are lots of ways we can probe the model with test data, but before we
+ spend too much time on this one, it might be informative to try other algorithms.
+
+ PostgresML makes it easy to reuse your training data with many of the best
+ algorithms available. Why not try them all?
'
+ execution_time: null
+ cell_number: 13
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 73
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: '--
+
+ -- After a project has been trained, omitted parameters will be reused from
+ previous training runs
+
+ -- In these examples we''ll reuse the training data snapshots from the initial
+ call.
+
+ --
+
+
+ -- Linear Models
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''ridge'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''stochastic_gradient_descent'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''perceptron'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''passive_aggressive'');
+
+
+ -- Support Vector Machines
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''svm'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''nu_svm'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''linear_svm'');
+
+
+ -- Ensembles
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''ada_boost'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''bagging'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''extra_trees'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''gradient_boosting_trees'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''random_forest'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+
+ -- Gradient Boosting
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost_random_forest'',
+ hyperparams => ''{"n_estimators": 10}'');
+
+ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''lightgbm'',
+ hyperparams => ''{"n_estimators": 1}'');'
+ rendering: "\n
\n \n \n \n
+ \ | project_name | \n \n task | \n
+ \ \n algorithm_name | \n \n status | \n
+ \ \n
\n \n \n \n \n \n | Breast
+ Cancer Detection | \n \n None | \n \n lightgbm | \n
+ \ \n not deployed | \n \n
\n \n \n
\n
\n"
+ execution_time: '00:00:03.250016'
+ cell_number: 14
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 74
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: "Turns out, computers are pretty fast these days, even with state of
+ the art algorithms running on free tier computation resources. \U0001F60A
+ \n\nYou can pop over to the [projects](/projects) tab for a visualization of
+ the performance of all these algorithms on this dataset, or you can check out
+ the artifacts directly in the database."
+ rendering: "Turns out, computers are pretty
+ fast these days, even with state of the art algorithms running on free tier
+ computation resources. \U0001F60A
\nYou can pop over to the projects
+ tab for a visualization of the performance of all these algorithms on this dataset,
+ or you can check out the artifacts directly in the database.
"
+ execution_time: null
+ cell_number: 15
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 75
+ fields:
+ notebook: 2
+ cell_type: 3
+ contents: "SELECT \n projects.name,\n models.algorithm_name,\n round((models.metrics->>'f1')::numeric,
+ 4) AS f1_score,\n round((models.metrics->>'precision')::numeric, 4) AS precision,\n
+ \ round((models.metrics->>'recall')::numeric, 4) AS recall\nFROM pgml.models\nJOIN
+ pgml.projects on projects.id = models.project_id\n AND projects.name = 'Breast
+ Cancer Detection'\nORDER BY models.metrics->>'f1' DESC LIMIT 5;"
+ rendering: "\n
\n \n \n \n
+ \ | name | \n \n algorithm_name | \n
+ \ \n f1_score | \n \n precision | \n
+ \ \n recall | \n \n
\n \n
+ \ \n \n \n \n | Breast Cancer Detection | \n
+ \ \n xgboost | \n \n 0.9860 | \n \n 0.9863 | \n
+ \ \n 0.9860 | \n \n
\n \n \n \n | Breast
+ Cancer Detection | \n \n random_forest | \n \n 0.9860 | \n
+ \ \n 0.9863 | \n \n 0.9860 | \n \n
\n
+ \ \n \n \n | Breast Cancer Detection | \n \n xgboost_random_forest | \n
+ \ \n 0.9790 | \n \n 0.9791 | \n \n 0.9790 | \n
+ \ \n
\n \n \n \n | Breast Cancer Detection | \n
+ \ \n gradient_boosting_trees | \n \n 0.9790 | \n
+ \ \n 0.9791 | \n \n 0.9790 | \n \n
\n
+ \ \n \n \n | Breast Cancer Detection | \n \n ridge | \n
+ \ \n 0.9789 | \n \n 0.9797 | \n \n 0.9790 | \n
+ \ \n
\n \n \n
\n
\n"
+ execution_time: '00:00:00.002094'
+ cell_number: 16
+ version: 1
+ deleted_at: null
+- model: notebooks.notebookcell
+ pk: 76
+ fields:
+ notebook: 2
+ cell_type: 1
+ contents: Tree based algorithms like `random_forest`, `xgboost` and `lightgbm`
+ do well on tabular datasets and frequently lead the pack with A+ level performance
+ as measured by the `f1_score`. They are generally sensitive to small changes
+ in the inputs, but also robust to outliers. They are also relatively fast algorithms
+ that can perform predictions in sub millisecond times, meaning most of the cost
+ of inference is in fetching the data they require as inputs. When your inputs
+ are already in the database with the model, that time is as fast as possible!
+ rendering: Tree based algorithms like random_forest,
+ xgboost and lightgbm do well on tabular datasets and
+ frequently lead the pack with A+ level performance as measured by the f1_score.
+ They are generally sensitive to small changes in the inputs, but also robust
+ to outliers. They are also relatively fast algorithms that can perform predictions
+ in sub millisecond times, meaning most of the cost of inference is in fetching
+ the data they require as inputs. When your inputs are already in the database
+ with the model, that time is as fast as possible!
+ execution_time: null
+ cell_number: 17
+ version: 1
+ deleted_at: null