🌐 AI搜索 & 代理 主页
Skip to content

Commit a8d8218

Browse files
authored
fix and test preprocessing examples (#1520)
1 parent c3a8514 commit a8d8218

File tree

14 files changed

+70
-26
lines changed

14 files changed

+70
-26
lines changed

.github/workflows/ubuntu-packages-and-docker-image.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on:
44
workflow_dispatch:
55
inputs:
66
packageVersion:
7-
default: "2.8.2"
7+
default: "2.9.1"
88
jobs:
99
#
1010
# PostgresML extension.

pgml-cms/docs/resources/developer-docs/contributing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ SELECT pgml.version();
127127
postgres=# select pgml.version();
128128
version
129129
-------------------
130-
2.7.4
130+
2.9.1
131131
(1 row)
132132
```
133133
{% endtab %}

pgml-cms/docs/resources/developer-docs/installation.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ CREATE EXTENSION
132132
pgml_test=# SELECT pgml.version();
133133
version
134134
---------
135-
2.7.4
135+
2.9.1
136136
(1 row)
137137
```
138138

pgml-cms/docs/resources/developer-docs/quick-start-with-docker.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ Time: 41.520 ms
8080
postgresml=# SELECT pgml.version();
8181
version
8282
---------
83-
2.7.13
83+
2.9.1
8484
(1 row)
8585
```
8686

pgml-cms/docs/resources/developer-docs/self-hosting/pooler.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,6 @@ Type "help" for help.
115115
postgresml=> SELECT pgml.version();
116116
version
117117
---------
118-
2.7.9
118+
2.9.1
119119
(1 row)
120120
```

pgml-extension/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgml-extension/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "pgml"
3-
version = "2.9.0"
3+
version = "2.9.1"
44
edition = "2021"
55

66
[lib]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
-- load the diamonds dataset, that contains text categorical variables
2+
SELECT pgml.load_dataset('jdxcosta/diamonds');
3+
4+
-- view the data
5+
SELECT * FROM pgml."jdxcosta/diamonds" LIMIT 10;
6+
7+
-- drop the "Unnamed: 0" column, since it's not useful for training (you could create a view instead)
8+
ALTER TABLE pgml."jdxcosta/diamonds" DROP COLUMN "Unnamed: 0";
9+
10+
-- train a model using preprocessors to scale the numeric variables, and target encode the categoricals
11+
SELECT pgml.train(
12+
project_name => 'Diamond prices',
13+
task => 'regression',
14+
relation_name => 'pgml.jdxcosta/diamonds',
15+
y_column_name => 'price',
16+
algorithm => 'lightgbm',
17+
preprocess => '{
18+
"carat": {"scale": "standard"},
19+
"depth": {"scale": "standard"},
20+
"table": {"scale": "standard"},
21+
"cut": {"encode": "target", "scale": "standard"},
22+
"color": {"encode": "target", "scale": "standard"},
23+
"clarity": {"encode": "target", "scale": "standard"}
24+
}'
25+
);
26+
27+
-- run some predictions; notice we're passing a heterogeneous row (tuple) as input, rather than a homogeneous ARRAY[].
28+
SELECT price, pgml.predict('Diamond prices', (carat, cut, color, clarity, depth, "table", x, y, z)) AS prediction
29+
FROM pgml."jdxcosta/diamonds"
30+
LIMIT 10;
31+
32+
-- This is a difficult dataset for most algorithms, which makes it a good challenge for preprocessing and additional
33+
-- feature engineering. What's next?

pgml-extension/sql/pgml--2.9.0--2.9.1.sql

Whitespace-only changes.

pgml-extension/src/bindings/transformers/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ pub fn load_dataset(
380380
.ok_or(anyhow!("dataset `data` key is not an object"))?;
381381
let column_names = types
382382
.iter()
383-
.map(|(name, _type)| name.clone())
383+
.map(|(name, _type)| format!("\"{}\"", name))
384384
.collect::<Vec<String>>()
385385
.join(", ");
386386
let column_types = types
@@ -393,13 +393,14 @@ pub fn load_dataset(
393393
"int64" => "INT8",
394394
"int32" => "INT4",
395395
"int16" => "INT2",
396+
"int8" => "INT2",
396397
"float64" => "FLOAT8",
397398
"float32" => "FLOAT4",
398399
"float16" => "FLOAT4",
399400
"bool" => "BOOLEAN",
400401
_ => bail!("unhandled dataset feature while reading dataset: {type_}"),
401402
};
402-
Ok(format!("{name} {type_}"))
403+
Ok(format!("\"{name}\" {type_}"))
403404
})
404405
.collect::<Result<Vec<String>>>()?
405406
.join(", ");
@@ -455,7 +456,7 @@ pub fn load_dataset(
455456
.into_datum(),
456457
)),
457458
"dict" | "list" => row.push((PgBuiltInOids::JSONBOID.oid(), JsonB(value.clone()).into_datum())),
458-
"int64" | "int32" | "int16" => row.push((
459+
"int64" | "int32" | "int16" | "int8" => row.push((
459460
PgBuiltInOids::INT8OID.oid(),
460461
value
461462
.as_i64()

0 commit comments

Comments
 (0)