Online News (UCI, Regression, n=39644, d=58)

Loading The Data

In [1]:
from kxy_datasets.uci_regressions import OnlineNews # pip install kxy_datasets
In [2]:
# Build the Online News dataset wrapper, then pull out the three pieces the
# cells below rely on:
#   df           - the dataset as a pandas dataframe
#   y_column     - the name of the column corresponding to the target
#   problem_type - 'regression' or 'classification'
dataset = OnlineNews()
df, y_column, problem_type = dataset.df, dataset.y_column, dataset.problem_type
In [3]:
# Print, for every column, its detected type and summary statistics
# (max, p75, mean, median, p25, min) as shown in the output below.
# NOTE(review): `.kxy` is not a built-in pandas attribute; presumably the accessor is
# registered as a side effect of the kxy_datasets import above - confirm, or add an
# explicit `import kxy` so a fresh kernel does not depend on a transitive import.
df.kxy.describe() # Visualize a summary of the data

---------------
Column:  LDA_00
---------------
Type:   Continuous
Max:    0.9
p75:    0.2
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

---------------
Column:  LDA_01
---------------
Type:   Continuous
Max:    0.9
p75:    0.2
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

---------------
Column:  LDA_02
---------------
Type:   Continuous
Max:    0.9
p75:    0.3
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

---------------
Column:  LDA_03
---------------
Type:   Continuous
Max:    0.9
p75:    0.4
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

---------------
Column:  LDA_04
---------------
Type:   Continuous
Max:    0.9
p75:    0.4
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

-------------------------------------
Column:  abs_title_sentiment_polarity
-------------------------------------
Type:   Continuous
Max:    1.0
p75:    0.2
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

-------------------------------
Column:  abs_title_subjectivity
-------------------------------
Type:   Continuous
Max:    0.5
p75:    0.5
Mean:   0.3
Median: 0.5
p25:    0.2
Min:    0.0

-----------------------------
Column:  average_token_length
-----------------------------
Type:   Continuous
Max:    8.0
p75:    4.9
Mean:   4.5
Median: 4.7
p25:    4.5
Min:    0.0

------------------------------
Column:  avg_negative_polarity
------------------------------
Type:   Continuous
Max:    0.0
p75:    -0.2
Mean:   -0.3
Median: -0.3
p25:    -0.3
Min:    -1.0

------------------------------
Column:  avg_positive_polarity
------------------------------
Type:   Continuous
Max:    1.0
p75:    0.4
Mean:   0.4
Median: 0.4
p25:    0.3
Min:    0.0

----------------------------
Column:  data_channel_is_bus
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

--------------------------------------
Column:  data_channel_is_entertainment
--------------------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

----------------------------------
Column:  data_channel_is_lifestyle
----------------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

-------------------------------
Column:  data_channel_is_socmed
-------------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

-----------------------------
Column:  data_channel_is_tech
-----------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

------------------------------
Column:  data_channel_is_world
------------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

-----------------------------------
Column:  global_rate_negative_words
-----------------------------------
Type:   Continuous
Max:    0.2
p75:    0.0
Mean:   0.0
Median: 0.0
p25:    0.0
Min:    0.0

-----------------------------------
Column:  global_rate_positive_words
-----------------------------------
Type:   Continuous
Max:    0.2
p75:    0.1
Mean:   0.0
Median: 0.0
p25:    0.0
Min:    0.0

----------------------------------
Column:  global_sentiment_polarity
----------------------------------
Type:   Continuous
Max:    0.7
p75:    0.2
Mean:   0.1
Median: 0.1
p25:    0.1
Min:    -0.4

----------------------------
Column:  global_subjectivity
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.5
Mean:   0.4
Median: 0.5
p25:    0.4
Min:    0.0

-------------------
Column:  is_weekend
-------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

-------------------
Column:  kw_avg_avg
-------------------
Type:   Continuous
Max:    43,567
p75:    3,600
Mean:   3,135
Median: 2,870
p25:    2,382
Min:    0.0

-------------------
Column:  kw_avg_max
-------------------
Type:   Continuous
Max:    843,300
p75:    330,980
Mean:   259,281
Median: 244,572
p25:    172,846
Min:    0.0

-------------------
Column:  kw_avg_min
-------------------
Type:   Continuous
Max:    42,827
p75:    357
Mean:   312
Median: 235
p25:    141
Min:    -1.0

-------------------
Column:  kw_max_avg
-------------------
Type:   Continuous
Max:    298,400
p75:    6,019
Mean:   5,657
Median: 4,355
p25:    3,562
Min:    0.0

-------------------
Column:  kw_max_max
-------------------
Type:   Continuous
Max:    843,300
p75:    843,300
Mean:   752,324
Median: 843,300
p25:    843,300
Min:    0.0

-------------------
Column:  kw_max_min
-------------------
Type:   Continuous
Max:    298,400
p75:    1,000
Mean:   1,153
Median: 660
p25:    445
Min:    0.0

-------------------
Column:  kw_min_avg
-------------------
Type:   Continuous
Max:    3,613
p75:    2,056
Mean:   1,117
Median: 1,023
p25:    0.0
Min:    -1.0

-------------------
Column:  kw_min_max
-------------------
Type:   Continuous
Max:    843,300
p75:    7,900
Mean:   13,612
Median: 1,400
p25:    0.0
Min:    0.0

-------------------
Column:  kw_min_min
-------------------
Type:   Continuous
Max:    377
p75:    4.0
Mean:   26
Median: -1.0
p25:    -1.0
Min:    -1.0

------------------------------
Column:  max_negative_polarity
------------------------------
Type:   Continuous
Max:    0.0
p75:    -0.1
Mean:   -0.1
Median: -0.1
p25:    -0.1
Min:    -1.0

------------------------------
Column:  max_positive_polarity
------------------------------
Type:   Continuous
Max:    1.0
p75:    1.0
Mean:   0.8
Median: 0.8
p25:    0.6
Min:    0.0

------------------------------
Column:  min_negative_polarity
------------------------------
Type:   Continuous
Max:    0.0
p75:    -0.3
Mean:   -0.5
Median: -0.5
p25:    -0.7
Min:    -1.0

------------------------------
Column:  min_positive_polarity
------------------------------
Type:   Continuous
Max:    1.0
p75:    0.1
Mean:   0.1
Median: 0.1
p25:    0.1
Min:    0.0

---------------------------------
Column:  n_non_stop_unique_tokens
---------------------------------
Type:   Continuous
Max:    650
p75:    0.8
Mean:   0.7
Median: 0.7
p25:    0.6
Min:    0.0

-------------------------
Column:  n_non_stop_words
-------------------------
Type:   Continuous
Max:    1,042
p75:    1.0
Mean:   1.0
Median: 1.0
p25:    1.0
Min:    0.0

-------------------------
Column:  n_tokens_content
-------------------------
Type:   Continuous
Max:    8,474
p75:    716
Mean:   546
Median: 409
p25:    246
Min:    0.0

-----------------------
Column:  n_tokens_title
-----------------------
Type:   Continuous
Max:    23
p75:    12
Mean:   10
Median: 10
p25:    9.0
Min:    2.0

------------------------
Column:  n_unique_tokens
------------------------
Type:   Continuous
Max:    701
p75:    0.6
Mean:   0.5
Median: 0.5
p25:    0.5
Min:    0.0

------------------
Column:  num_hrefs
------------------
Type:   Continuous
Max:    304
p75:    14
Mean:   10
Median: 8.0
p25:    4.0
Min:    0.0

-----------------
Column:  num_imgs
-----------------
Type:   Continuous
Max:    128
p75:    4.0
Mean:   4.5
Median: 1.0
p25:    1.0
Min:    0.0

---------------------
Column:  num_keywords
---------------------
Type:   Continuous
Max:    10
p75:    9.0
Mean:   7.2
Median: 7.0
p25:    6.0
Min:    1.0

-----------------------
Column:  num_self_hrefs
-----------------------
Type:   Continuous
Max:    116
p75:    4.0
Mean:   3.3
Median: 3.0
p25:    1.0
Min:    0.0

-------------------
Column:  num_videos
-------------------
Type:   Continuous
Max:    91
p75:    1.0
Mean:   1.2
Median: 0.0
p25:    0.0
Min:    0.0

----------------------------
Column:  rate_negative_words
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.4
Mean:   0.3
Median: 0.3
p25:    0.2
Min:    0.0

----------------------------
Column:  rate_positive_words
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.8
Mean:   0.7
Median: 0.7
p25:    0.6
Min:    0.0

-----------------------------------
Column:  self_reference_avg_sharess
-----------------------------------
Type:   Continuous
Max:    843,300
p75:    5,200
Mean:   6,401
Median: 2,200
p25:    981
Min:    0.0

----------------------------------
Column:  self_reference_max_shares
----------------------------------
Type:   Continuous
Max:    843,300
p75:    8,000
Mean:   10,329
Median: 2,800
p25:    1,100
Min:    0.0

----------------------------------
Column:  self_reference_min_shares
----------------------------------
Type:   Continuous
Max:    843,300
p75:    2,600
Mean:   3,998
Median: 1,200
p25:    639
Min:    0.0

---------------
Column:  shares
---------------
Type:   Continuous
Max:    843,300
p75:    2,800
Mean:   3,395
Median: 1,400
p25:    946
Min:    1.0

---------------------------------
Column:  title_sentiment_polarity
---------------------------------
Type:   Continuous
Max:    1.0
p75:    0.1
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    -1.0

---------------------------
Column:  title_subjectivity
---------------------------
Type:   Continuous
Max:    1.0
p75:    0.5
Mean:   0.3
Median: 0.1
p25:    0.0
Min:    0.0

--------------------------
Column:  weekday_is_friday
--------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

--------------------------
Column:  weekday_is_monday
--------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

----------------------------
Column:  weekday_is_saturday
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

--------------------------
Column:  weekday_is_sunday
--------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.1
Median: 0.0
p25:    0.0
Min:    0.0

----------------------------
Column:  weekday_is_thursday
----------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

---------------------------
Column:  weekday_is_tuesday
---------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

-----------------------------
Column:  weekday_is_wednesday
-----------------------------
Type:   Continuous
Max:    1.0
p75:    0.0
Mean:   0.2
Median: 0.0
p25:    0.0
Min:    0.0

Data Valuation

In [4]:
# Run kxy's data valuation for the target column `y_column`, using the problem type
# reported by the dataset. The displayed result is a one-row table with the
# achievable R-squared, achievable log-likelihood per sample and achievable RMSE.
# NOTE(review): the reported achievable R-squared of 1.00 is strikingly high; sanity-check
# it before treating it as a realistic performance ceiling.
df.kxy.data_valuation(y_column, problem_type=problem_type)
[====================================================================================================] 100% ETA: 0s    Duration: 0s
Out[4]:
Achievable R-Squared Achievable Log-Likelihood Per Sample Achievable RMSE
0 1.00 -9.52 219

Automatic (Model-Free) Variable Selection

In [5]:
# Run kxy's model-free variable selection for the target column `y_column`.
# The displayed result lists the variables by selection order (row 0 is "No Variable"),
# together with the running achievable R-squared and running achievable RMSE after
# each variable is added.
df.kxy.variable_selection(y_column, problem_type=problem_type)
[====================================================================================================] 100% ETA: 0s    Duration: 0s
Out[5]:
Variable Running Achievable R-Squared Running Achievable RMSE
Selection Order
0 No Variable 0.00 1.16e+04
1 kw_avg_avg 0.06 1.13e+04
2 self_reference_avg_sharess 0.07 1.12e+04
3 kw_avg_max 0.08 1.12e+04
4 data_channel_is_entertainment 0.10 1.10e+04
5 is_weekend 0.10 1.10e+04
6 kw_min_avg 0.10 1.10e+04
7 data_channel_is_world 0.16 1.07e+04
8 kw_max_max 0.16 1.07e+04
9 n_unique_tokens 0.16 1.07e+04
10 self_reference_max_shares 0.16 1.07e+04
11 data_channel_is_lifestyle 0.21 1.03e+04
12 LDA_04 0.25 1.00e+04
13 LDA_00 0.30 9.74e+03
14 num_hrefs 0.37 9.26e+03
15 num_videos 0.43 8.81e+03
16 data_channel_is_bus 0.48 8.39e+03
17 kw_min_min 0.48 8.39e+03
18 min_positive_polarity 0.48 8.39e+03
19 global_subjectivity 0.48 8.39e+03
20 n_non_stop_words 0.80 5.18e+03
21 num_imgs 0.92 3.20e+03
22 n_tokens_content 0.97 1.98e+03
23 n_non_stop_unique_tokens 0.99 9.49e+02
24 num_keywords 1.00 4.56e+02
25 data_channel_is_socmed 1.00 2.19e+02
26 weekday_is_wednesday 1.00 2.19e+02
27 kw_avg_min 1.00 2.19e+02
28 num_self_hrefs 1.00 2.19e+02
29 weekday_is_tuesday 1.00 2.19e+02
30 data_channel_is_tech 1.00 2.19e+02
31 weekday_is_thursday 1.00 2.19e+02
32 kw_max_avg 1.00 2.19e+02
33 global_sentiment_polarity 1.00 2.19e+02
34 title_sentiment_polarity 1.00 2.19e+02
35 abs_title_subjectivity 1.00 2.19e+02
36 LDA_02 1.00 2.19e+02
37 self_reference_min_shares 1.00 2.19e+02
38 title_subjectivity 1.00 2.19e+02
39 global_rate_negative_words 1.00 2.19e+02
40 rate_positive_words 1.00 2.19e+02
41 global_rate_positive_words 1.00 2.19e+02
42 weekday_is_monday 1.00 2.19e+02
43 abs_title_sentiment_polarity 1.00 2.19e+02
44 n_tokens_title 1.00 2.19e+02
45 kw_max_min 1.00 2.19e+02
46 average_token_length 1.00 2.19e+02
47 LDA_01 1.00 2.19e+02
48 min_negative_polarity 1.00 2.19e+02
49 avg_negative_polarity 1.00 2.19e+02
50 max_negative_polarity 1.00 2.19e+02
51 LDA_03 1.00 2.19e+02
52 max_positive_polarity 1.00 2.19e+02
53 avg_positive_polarity 1.00 2.19e+02
54 kw_min_max 1.00 2.19e+02
55 weekday_is_saturday 1.00 2.19e+02
56 weekday_is_sunday 1.00 2.19e+02
57 weekday_is_friday 1.00 2.19e+02
58 rate_negative_words 1.00 2.19e+02