
Commit cae3ce0
Committed Jun 18, 2022

add dimensionality reduction using feature selection tutorial

1 parent ccd6b89

File tree

4 files changed: +446 −0 lines changed
 
Lines changed: 1 addition & 0 deletions

# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)
Lines changed: 306 additions & 0 deletions

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "iImkWEpRSiRq"
   },
   "outputs": [],
   "source": [
    "# Load libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.datasets import load_iris, make_regression\n",
    "from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import warnings\n",
    "from sklearn import datasets, linear_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ZEK7KAyzSokS",
    "outputId": "7ce72382-c116-4f51-df7b-1f975c1c25f8"
   },
   "outputs": [],
   "source": [
    "# Import data\n",
    "iris = datasets.load_iris()\n",
    "# Create features and target\n",
    "features_i = iris.data\n",
    "target_i = iris.target\n",
    "# Create the thresholder\n",
    "thresholder = VarianceThreshold(threshold=.4)\n",
    "# Create the high-variance feature matrix\n",
    "f_high_variance = thresholder.fit_transform(features_i)\n",
    "# View the high-variance feature matrix\n",
    "f_high_variance[0:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7ZZgOg1-SpuX",
    "outputId": "a869adde-0b29-4630-9661-34377f110d4f"
   },
   "outputs": [],
   "source": [
    "# View the variance of each feature\n",
    "thresholder.fit(features_i).variances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "zYNK4wP5Sq9R",
    "outputId": "30e18ea5-4b63-43e5-819e-9a99251dfae6"
   },
   "outputs": [],
   "source": [
    "# Standardize the feature matrix\n",
    "scaler = StandardScaler()\n",
    "f_std = scaler.fit_transform(features_i)\n",
    "# Calculate the variance of each standardized feature\n",
    "selection = VarianceThreshold()\n",
    "selection.fit(f_std).variances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "jDGMP97LSuiB",
    "outputId": "c1b9d537-495f-4109-ef75-324fe9943668"
   },
   "outputs": [],
   "source": [
    "# Create a binary feature matrix with:\n",
    "# Feature 0: 80% class 0\n",
    "# Feature 1: 80% class 1\n",
    "# Feature 2: 60% class 0, 40% class 1\n",
    "features_b = [[0, 1, 0],\n",
    "              [0, 1, 1],\n",
    "              [0, 1, 0],\n",
    "              [0, 1, 1],\n",
    "              [1, 0, 0]]\n",
    "# Threshold by variance: keep features whose variance exceeds\n",
    "# that of a Bernoulli variable with p = .65\n",
    "thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",
    "thresholding.fit_transform(features_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 198
    },
    "id": "JvnObeKXS6xm",
    "outputId": "19dac143-9407-4bb4-cc23-b19b06025617"
   },
   "outputs": [],
   "source": [
    "# Create a feature matrix with two highly correlated features\n",
    "features_m = np.array([[1, 1, 1],\n",
    "                       [2, 2, 0],\n",
    "                       [3, 3, 1],\n",
    "                       [4, 4, 0],\n",
    "                       [5, 5, 1],\n",
    "                       [6, 6, 0],\n",
    "                       [7, 7, 1],\n",
    "                       [8, 7, 0],\n",
    "                       [9, 7, 1]])\n",
    "# Convert the feature matrix to a DataFrame\n",
    "dataframe = pd.DataFrame(features_m)\n",
    "# Create the correlation matrix\n",
    "corr_m = dataframe.corr().abs()\n",
    "# Select the upper triangle of the correlation matrix\n",
    "upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),\n",
    "                              k=1).astype(bool))\n",
    "# Find the index of feature columns with a correlation greater than 0.85\n",
    "to_drop = [col for col in upper1.columns if any(upper1[col] > 0.85)]\n",
    "# Drop those features\n",
    "dataframe.drop(dataframe.columns[to_drop], axis=1).head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Dos1ZfkDS-Zd",
    "outputId": "17e96f0d-a55a-4943-90a9-99aa3c31fad3"
   },
   "outputs": [],
   "source": [
    "# Load data\n",
    "iris_i = load_iris()\n",
    "features_v = iris_i.data\n",
    "target = iris_i.target\n",
    "# Convert to categorical data by casting to integers\n",
    "features_v = features_v.astype(int)\n",
    "# Select the two features with the highest chi-squared statistics\n",
    "chi2_s = SelectKBest(chi2, k=2)\n",
    "f_kbest = chi2_s.fit_transform(features_v, target)\n",
    "# Show results\n",
    "print(\"Original number of features:\", features_v.shape[1])\n",
    "print(\"Reduced number of features:\", f_kbest.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "y10u_gQbTCwR",
    "outputId": "651182ab-d857-4a3d-db61-4fff866d167c"
   },
   "outputs": [],
   "source": [
    "# Select the two features with the highest F-values\n",
    "f_selector = SelectKBest(f_classif, k=2)\n",
    "f_kbest = f_selector.fit_transform(features_v, target)\n",
    "# Display results\n",
    "print(\"Original number of features:\", features_v.shape[1])\n",
    "print(\"Reduced number of features:\", f_kbest.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5NXAa6UKTHiu",
    "outputId": "c34866b2-c08c-4020-b14d-78deb98f2834"
   },
   "outputs": [],
   "source": [
    "# Select the top 65% of features by F-value\n",
    "f_selector = SelectPercentile(f_classif, percentile=65)\n",
    "f_kbest = f_selector.fit_transform(features_v, target)\n",
    "# Display results\n",
    "print(\"Original number of features:\", features_v.shape[1])\n",
    "print(\"Reduced number of features:\", f_kbest.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "39-Wq-F9TKVg",
    "outputId": "e52c0537-2245-4f12-ea9a-ace232984ec1"
   },
   "outputs": [],
   "source": [
    "# Suppress an annoying but harmless warning\n",
    "warnings.filterwarnings(action=\"ignore\", module=\"scipy\",\n",
    "                        message=\"^internal gelsd\")\n",
    "# Generate a features matrix and target vector\n",
    "features_f, target_t = make_regression(n_samples=10000,\n",
    "                                       n_features=100,\n",
    "                                       n_informative=2,\n",
    "                                       random_state=1)\n",
    "# Create a linear regression estimator\n",
    "ols = linear_model.LinearRegression()\n",
    "# Recursive feature elimination with cross-validation\n",
    "rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",
    "rfecv.fit(features_f, target_t)\n",
    "rfecv.transform(features_f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Ut1mgIGEUhJM",
    "outputId": "f365a4d5-63f4-4a55-e828-d331e6f06308"
   },
   "outputs": [],
   "source": [
    "# Number of best features\n",
    "rfecv.n_features_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Lpt7I_Q0UjN1",
    "outputId": "4d6938dc-d813-42a5-c1b7-9ba4865a0e86"
   },
   "outputs": [],
   "source": [
    "# Which features are selected as best?\n",
    "rfecv.support_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ojYKsEbTUkMu",
    "outputId": "98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"
   },
   "outputs": [],
   "source": [
    "# We can even see how the features are ranked\n",
    "rfecv.ranking_"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Untitled42.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Lines changed: 136 additions & 0 deletions

# -*- coding: utf-8 -*-
"""Untitled42.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pE7KFZcxTLcnAuXUXcKSfsfskiMMPVQd
"""

# Load libraries
import warnings

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris, make_regression
from sklearn.feature_selection import (SelectKBest, SelectPercentile,
                                       VarianceThreshold, RFECV,
                                       chi2, f_classif)
from sklearn.preprocessing import StandardScaler

# Import data
iris = datasets.load_iris()
# Create features and target
features_i = iris.data
target_i = iris.target
# Create the thresholder
thresholder = VarianceThreshold(threshold=.4)
# Create the high-variance feature matrix
f_high_variance = thresholder.fit_transform(features_i)
# View the high-variance feature matrix
f_high_variance[0:3]

# View the variance of each feature
thresholder.fit(features_i).variances_

# Standardize the feature matrix
scaler = StandardScaler()
f_std = scaler.fit_transform(features_i)
# Calculate the variance of each standardized feature
selection = VarianceThreshold()
selection.fit(f_std).variances_

# Create a binary feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features_b = [[0, 1, 0],
              [0, 1, 1],
              [0, 1, 0],
              [0, 1, 1],
              [1, 0, 0]]
# Threshold by variance: keep features whose variance exceeds
# that of a Bernoulli variable with p = .65
thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))
thresholding.fit_transform(features_b)

# Create a feature matrix with two highly correlated features
features_m = np.array([[1, 1, 1],
                       [2, 2, 0],
                       [3, 3, 1],
                       [4, 4, 0],
                       [5, 5, 1],
                       [6, 6, 0],
                       [7, 7, 1],
                       [8, 7, 0],
                       [9, 7, 1]])
# Convert the feature matrix to a DataFrame
dataframe = pd.DataFrame(features_m)
# Create the correlation matrix
corr_m = dataframe.corr().abs()
# Select the upper triangle of the correlation matrix
upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),
                              k=1).astype(bool))
# Find the index of feature columns with a correlation greater than 0.85
to_drop = [col for col in upper1.columns if any(upper1[col] > 0.85)]
# Drop those features
dataframe.drop(dataframe.columns[to_drop], axis=1).head(3)

# Load data
iris_i = load_iris()
features_v = iris_i.data
target = iris_i.target
# Convert to categorical data by casting to integers
features_v = features_v.astype(int)
# Select the two features with the highest chi-squared statistics
chi2_s = SelectKBest(chi2, k=2)
f_kbest = chi2_s.fit_transform(features_v, target)
# Show results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])

# Select the two features with the highest F-values
f_selector = SelectKBest(f_classif, k=2)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])

# Select the top 65% of features by F-value
f_selector = SelectPercentile(f_classif, percentile=65)
f_kbest = f_selector.fit_transform(features_v, target)
# Display results
print("Original number of features:", features_v.shape[1])
print("Reduced number of features:", f_kbest.shape[1])

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")
# Generate a features matrix and target vector
features_f, target_t = make_regression(n_samples=10000,
                                       n_features=100,
                                       n_informative=2,
                                       random_state=1)
# Create a linear regression estimator
ols = linear_model.LinearRegression()
# Recursive feature elimination with cross-validation
rfecv = RFECV(estimator=ols, step=2, scoring="neg_mean_squared_error")
rfecv.fit(features_f, target_t)
rfecv.transform(features_f)

# Number of best features
rfecv.n_features_

# Which features are selected as best?
rfecv.support_

# We can even see how the features are ranked
rfecv.ranking_

help(SelectKBest)
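
The script above fits each selector on the full dataset for demonstration. In practice, feature selection is usually learned from the training split only, so the selected features do not leak information about the test set. Below is a minimal sketch of that pattern, not part of this commit, assuming scikit-learn's Pipeline, train_test_split, and LogisticRegression APIs:

# Hypothetical usage sketch (not part of this commit): embed SelectKBest
# in a Pipeline so the selection is fitted on the training data only.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=1)

pipe = Pipeline([
    ("select", SelectKBest(f_classif, k=2)),     # keep the 2 best features
    ("clf", LogisticRegression(max_iter=1000)),  # classify on them
])
pipe.fit(X_train, y_train)  # selection learned from X_train only
print("Test accuracy:", pipe.score(X_test, y_test))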
Lines changed: 3 additions & 0 deletions

numpy
pandas
scikit-learn
