Linear regression analysis scatter-plot on synthetic data

This page documents a preview version. v2.23 Preview
Preview includes features under active development and is for development and testing only.
For production, use the latest stable version (v2024.1).

Save this script as "synthetic-data.sql"

drop procedure if exists populate_t(
  int,  double precision,  double precision,  double precision,  double precision)
  cascade;

drop table if exists t cascade;
create table t(
  k      int primary key,
  x      double precision,
  y      double precision,
  delta  double precision);

create procedure populate_t(
  no_of_rows  in int,
  slope       in double precision,
  intercept   in double precision,
  mean        in double precision,
  stddev      in double precision)
  language plpgsql
as $body$
begin
  delete from t;

  with
    a1 as (
      select
        s.v        as k,
        s.v        as x,
        (s.v * slope) + intercept as y
      from generate_series(1, no_of_rows) as s(v)),

    a2 as (
      select (
        row_number() over()) as k,
        r.v as delta
      from normal_rand(no_of_rows, mean, stddev) as r(v))

  insert into t(k, x, y, delta)
  select
    k, x, a1.y, a2.delta
  from a1 inner join a2 using(k);

  insert into t(k, x, y, delta) values
    (no_of_rows + 1,    0, null, null),
    (no_of_rows + 2, null,    0, null);
end;
$body$;

call populate_t(
  no_of_rows  => 100,
  mean        =>  0.0,
  stddev      => 5.0,
  slope       =>  -1.2,
  intercept   =>  131.4);

\o analysis-results/synthetic-data.txt
with a as (
  select
    regr_r2       ((y + delta), x) as r2,
    regr_slope    ((y + delta), x) as s,
    regr_intercept((y + delta), x) as i
  from t)
select
  to_char(r2, '0.99') as r2,
  to_char(s,  '90.9') as s,
  to_char(i, '990.9') as i
from a;
\o

\t on
\o analysis-results/synthetic-data.csv
select
  round(x)::text||','||round(y + delta)::text
from t
where
  x > 60        and
  x < 95        and
  x is not null and
  y is not null
order by x;
\o
\t off