# Toy 1D Regression Examples (Small n)

In [1]:

%load_ext autoreload

import pylab as plt
import pandas as pd
import numpy as np
np.random.seed(0)
import kxy

In [2]:

# Grid-resolution parameter. NOTE: despite its name, n is not the number of
# samples. The grid below uses a step of 1/(2n) over [-1, 1.01), which gives
# roughly 4n points (about 400 for n = 100).
n = 100
x = np.arange(-1., 1.01, 1./(2*n))

# Noiseless functions. names[j] is the LaTeX label of fs[j]; both lists must
# stay in the same order because later cells pair them by index.
names = [
    r'''$y=x$''',
    r'''$y=-\sqrt{|x|}$''',
    r'''$y=-x^3$''',
    r'''$y=\tanh\left(\frac{5}{2}x\right)$''',
]
fs = [lambda u: u, lambda u: -np.sqrt(np.abs(u)), lambda u: (-u)**3, lambda u: np.tanh(2.5*u)]

# Noise configurations
rsqs = np.array([1., .99, .75, .50, .25])  # Desired exact R^2
# Each noiseless signal is rescaled to unit variance below, so
# R^2 = 1 / (1 + err_var), i.e. err_var = 1/R^2 - 1.
err_var = 1./rsqs - 1.
err_std = np.sqrt(err_var)  # Implied exact RMSE, assuming a unit-variance noiseless signal

# Generate the data: data[i][j] is a DataFrame with columns ['y', 'x'] holding
# function fs[j] observed at noise level rsqs[i].
# The outer list is built by appending (not with [[]] * k, which would make
# every slot refer to the same inner list).
data = []
for i in range(len(rsqs)):
    dfl = []
    for j in range(len(fs)):
        y_ = fs[j](x)
        y = y_/y_.std()  # Normalise the noiseless signal to unit standard deviation
        # randn is drawn even when err_std[i] == 0, so the random stream is
        # consumed identically at every noise level.
        y = y + err_std[i]*np.random.randn(x.shape[0])
        z = np.column_stack([y, x])
        df = pd.DataFrame(z, columns=['y', 'x'])
        dfl.append(df)

    data.append(dfl)


## Data Valuation

In [3]:

# Run the KxY data valuation on every (noise level, function) dataset and
# collect the reported achievable R^2 and RMSE.
# Both result grids are indexed as [noise level i][function j].
estimated_rsqs = []
estimated_rmses = []

for i, exact_rsq in enumerate(rsqs):
    # Banner announcing the exact figures for this noise level.
    print('\n\n')
    print(r'''-------------------------------------''')
    print(r'''Exact $R^2$: %.2f, Exact RMSE: %.2f''' % (exact_rsq, err_std[i]))
    print(r'''-------------------------------------''')

    rsq_row = []
    rmse_row = []
    for j, label in enumerate(names):
        print()
        print('KxY estimation for %s' % label)
        dv = data[i][j].kxy.data_valuation('y', problem_type='regression')
        # Keep the value stored under label 0 of each reported column.
        rsq_row.append(float(dv['Achievable R-Squared'][0]))
        rmse_row.append(float(dv['Achievable RMSE'][0]))
        print(dv)

    estimated_rsqs.append(rsq_row)
    estimated_rmses.append(rmse_row)



-------------------------------------
Exact $R^2$: 1.00, Exact RMSE: 0.00
-------------------------------------

KxY estimation for $y=x$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 1.00                                 2.11        2.96e-02

KxY estimation for $y=\sqrt{|x|}$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 1.00                                 4.31        3.30e-03

KxY estimation for $y=-x^3$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 1.00                                 4.30        3.95e-03

KxY estimation for $y=\tanh\left(\frac{5}{2}u\right)$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 1.00                                 3.75        6.64e-03

-------------------------------------
Exact $R^2$: 0.99, Exact RMSE: 0.10
-------------------------------------

KxY estimation for $y=x$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.99                             8.11e-01        1.08e-01

KxY estimation for $y=\sqrt{|x|}$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.99                             7.53e-01        1.15e-01

KxY estimation for $y=-x^3$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.99                             8.54e-01        1.20e-01

KxY estimation for $y=\tanh\left(\frac{5}{2}u\right)$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.97                             5.20e-01        1.66e-01

-------------------------------------
Exact $R^2$: 0.75, Exact RMSE: 0.58
-------------------------------------

KxY estimation for $y=x$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.76                            -8.97e-01        5.74e-01

KxY estimation for $y=\sqrt{|x|}$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.73                            -9.31e-01        5.97e-01

KxY estimation for $y=-x^3$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.75                            -8.55e-01        5.70e-01

KxY estimation for $y=\tanh\left(\frac{5}{2}u\right)$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.75                            -8.85e-01        5.78e-01

-------------------------------------
Exact $R^2$: 0.50, Exact RMSE: 1.00
-------------------------------------

KxY estimation for $y=x$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.52                                -1.41        9.52e-01

KxY estimation for $y=\sqrt{|x|}$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.56                                -1.38        9.26e-01

KxY estimation for $y=-x^3$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.57                                -1.41        9.52e-01

KxY estimation for $y=\tanh\left(\frac{5}{2}u\right)$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.51                                -1.44        9.82e-01

-------------------------------------
Exact $R^2$: 0.25, Exact RMSE: 1.73
-------------------------------------

KxY estimation for $y=x$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.33                                -1.99            1.70

KxY estimation for $y=\sqrt{|x|}$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.40                                -1.80            1.40

KxY estimation for $y=-x^3$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.35                                -1.96            1.66

KxY estimation for $y=\tanh\left(\frac{5}{2}u\right)$
[====================================================================================================] 100% ETA: 0s
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0                 0.35                                -1.84            1.47

In [4]:

# Scatter-plot grid: one row per noise level, one column per function.
# The top row is titled with the function label, every other row with the
# R^2 estimated in the valuation cell; the left-most column carries the
# exact R^2 as its y-label.
fig, axes = plt.subplots(len(rsqs), len(fs), figsize=(20, 20))
for i, axes_row in enumerate(axes):
    for j, ax in enumerate(axes_row):
        ax.plot(x, data[i][j]['y'].values, '.')
        ax.set_xticks(())
        ax.set_yticks(())

        title = names[j] if i == 0 else (r'''Estimated $R^2$= %.2f''' % estimated_rsqs[i][j])
        ax.set_title(title, fontsize=15)

        if j == 0:
            ax.set_ylabel(r'''Exact $R^2$= %.2f''' % rsqs[i], fontsize=15)

In [5]:

# Scatter-plot grid: one row per noise level, one column per function.
# The top row is titled with the function label, every other row with the
# RMSE estimated in the valuation cell; the left-most column carries the
# exact RMSE as its y-label.
fig, axes = plt.subplots(len(rsqs), len(fs), figsize=(20, 20))
for i, axes_row in enumerate(axes):
    for j, ax in enumerate(axes_row):
        ax.plot(x, data[i][j]['y'].values, '.')
        ax.set_xticks(())
        ax.set_yticks(())

        title = names[j] if i == 0 else (r'''Estimated RMSE= %.2f''' % estimated_rmses[i][j])
        ax.set_title(title, fontsize=15)

        if j == 0:
            ax.set_ylabel(r'''Exact RMSE= %.2f''' % err_std[i], fontsize=15)