In: Computer Science
I am writing machine learning (classification) code to classify between two classes. I started by extracting a single feature from each of my images.
for example:
class A=[(4295046.0, 1), (4998220.0, 1), (4565017.0, 1), (4078291.0, 1), (4350411.0, 1), (4434050.0, 1), (4201831.0, 1), (4203570.0, 1), (4197025.0, 1), (4110781.0, 1), (4080568.0, 1), (4276499.0, 1), (4363551.0, 1), (4241573.0, 1), (4455070.0, 1), (5682823.0, 1), (5572122.0, 1), (5382890.0, 1), (5217487.0, 1), (4714908.0, 1), (4697137.0, 1), (4737784.0, 1), (4648881.0, 1), (4591211.0, 1), (4750706.0, 1), (5067788.0, 1), (7392115.0, 1), (7024501.0, 1), (6590118.0, 1), (6260326.0, 1), (6001223.0, 1), (5513267.0, 1), (5684732.0, 1), (4092011.0, 1), (6634798.0, 1), (6885369.0, 1), (2854799.0, 1), (2642866.0, 1), (2591293.0, 1), (2345370.0, 1), (2353085.0, 1), (2598480.0, 1), (3996284.0, 1), (7536032.0, 1), (7338023.0, 1), (7561037.0, 1), (7529364.0, 1), (7577504.0, 1), (7353176.0, 1), (4057898.0, 1), (4143981.0, 1), (3899129.0, 1), (3830584.0, 1), (3557377.0, 1), (3125518.0, 1), (3197039.0, 1), (3109404.0, 1), (3024219.0, 1), (3066759.0, 1), (2726363.0, 1), (3507626.0, 1), (2531828.0, 1), (2330385.0, 1), (2317570.0, 1), (2444669.0, 1), (2513998.0, 1), (2624739.0, 1), (3555578.0, 1), (2582228.0, 1), (4404128.0, 1), (4307425.0, 1), (4188310.0, 1), (2460042.0, 1), (4387062.0, 1), (2162785.0, 1), (2168945.0, 1), (2304868.0, 1), (2437261.0, 1), (3557410.0, 1), (3830618.0, 1), (3550021.0, 1), (3588758.0, 1), (3447567.0, 1), (3559924.0, 1), (3284499.0, 1), (3595260.0, 1), (4494963.0, 1), (4294039.0, 1), (3849395.0, 1), (3620279.0, 1), (3406951.0, 1), (3578885.0, 1), (3763810.0, 1), (3820821.0, 1)]
class B=[(7179088.0, 0), (7144249.0, 0), (6806806.0, 0), (5080876.0, 0), (5170390.0, 0), (5694876.0, 0), (6210510.0, 0), (5376014.0, 0), (6472171.0, 0), (7112956.0, 0), (7356507.0, 0), (7418046.0, 0), (7975884.0, 0), (7862043.0, 0), (7627016.0, 0), (7778397.0, 0), (7175463.0, 0), (7347721.0, 0), (5646602.0, 0), (5357049.0, 0), (6435755.0, 0), (7254820.0, 0), (7509701.0, 0), (7588029.0, 0), (7491507.0, 0), (7505240.0, 0), (7650181.0, 0), (7574974.0, 0), (7579726.0, 0), (7444229.0, 0), (3777032.0, 0), (7379626.0, 0), (7184128.0, 0), (7320911.0, 0), (7425228.0, 0), (7489048.0, 0), (7145778.0, 0), (7754034.0, 0), (8635490.0, 0), (8798277.0, 0), (8067185.0, 0), (8205319.0, 0), (8908959.0, 0), (9153481.0, 0), (9180030.0, 0), (9183460.0, 0), (9212517.0, 0), (9055663.0, 0), (9053709.0, 0), (9103067.0, 0), (8889903.0, 0), (8328604.0, 0), (8475442.0, 0), (8499221.0, 0), (8752169.0, 0), (8779133.0, 0), (8756789.0, 0), (8990732.0, 0), (9027381.0, 0), (9090035.0, 0), (9343846.0, 0), (9518609.0, 0), (9435149.0, 0), (9365842.0, 0), (9395256.0, 0), (4381880.0, 0), (4749338.0, 0), (5296143.0, 0), (5478942.0, 0), (5610865.0, 0), (5514997.0, 0), (5381010.0, 0), (5090416.0, 0), (4663958.0, 0), (4804526.0, 0), (4743107.0, 0), (4898914.0, 0), (5018503.0, 0), (5778240.0, 0), (5741893.0, 0), (4632926.0, 0), (5208486.0, 0), (5633403.0, 0), (5699410.0, 0), (5748260.0, 0), (5869260.0, 0), (5589575.0, 0), (5627535.0, 0), (5551501.0, 0), (5467609.0, 0), (5513782.0, 0), (5491950.0, 0), (5887072.0, 0), (6419620.0, 0), (6625864.0, 0), (6645778.0, 0), (6580741.0, 0), (6152337.0, 0), (5991092.0, 0), (5847561.0, 0), (5718127.0, 0), (5971544.0, 0), (6031962.0, 0), (5873358.0, 0), (6135263.0, 0), (2886886.0, 0), (3855637.0, 0), (7817578.0, 0), (3747685.0, 0), (7886519.0, 0), (8277473.0, 0), (8284216.0, 0), (8284850.0, 0), (7753420.0, 0), (7825824.0, 0), (3808486.0, 0), (3809493.0, 0), (3808122.0, 0), (3637373.0, 0), (3556258.0, 0), (3487921.0, 0), (3475961.0, 0), (3468375.0, 0), (3410898.0, 0), 
(3965656.0, 0), (4175368.0, 0), (4602949.0, 0), (4718392.0, 0), (4876949.0, 0), (5129132.0, 0), (5110047.0, 0), (5099632.0, 0), (4935172.0, 0), (4303854.0, 0)]
rest of my code:
# data is A and B combined
x = [[each[0]] for each in data]
y = [[each[1]] for each in data]
print (len(x), len(y))
x_train, x_test, y_train, y_test = train_test_split(x, y,
test_size=0.2, random_state=42)
print (len(x_train), len(x_test))
print (len(y_train), len(y_test))
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
random_state=0)
clf.fit(x_train, y_train)
Question:
What should I change to add another feature? How should A and B look, and how should the classifier look? [Python]
#CODE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Class A samples: each tuple is (feature value, class label); every label here is 1.
A=[(4295046.0, 1), (4998220.0, 1), (4565017.0, 1), (4078291.0,
1), (4350411.0, 1), (4434050.0, 1), (4201831.0, 1), (4203570.0, 1),
(4197025.0, 1), (4110781.0, 1), (4080568.0, 1), (4276499.0, 1),
(4363551.0, 1), (4241573.0, 1), (4455070.0, 1), (5682823.0, 1),
(5572122.0, 1), (5382890.0, 1), (5217487.0, 1), (4714908.0, 1),
(4697137.0, 1), (4737784.0, 1), (4648881.0, 1), (4591211.0, 1),
(4750706.0, 1), (5067788.0, 1), (7392115.0, 1), (7024501.0, 1),
(6590118.0, 1), (6260326.0, 1), (6001223.0, 1), (5513267.0, 1),
(5684732.0, 1), (4092011.0, 1), (6634798.0, 1), (6885369.0, 1),
(2854799.0, 1), (2642866.0, 1), (2591293.0, 1), (2345370.0, 1),
(2353085.0, 1), (2598480.0, 1), (3996284.0, 1), (7536032.0, 1),
(7338023.0, 1), (7561037.0, 1), (7529364.0, 1), (7577504.0, 1),
(7353176.0, 1), (4057898.0, 1), (4143981.0, 1), (3899129.0, 1),
(3830584.0, 1), (3557377.0, 1), (3125518.0, 1), (3197039.0, 1),
(3109404.0, 1), (3024219.0, 1), (3066759.0, 1), (2726363.0, 1),
(3507626.0, 1), (2531828.0, 1), (2330385.0, 1), (2317570.0, 1),
(2444669.0, 1), (2513998.0, 1), (2624739.0, 1), (3555578.0, 1),
(2582228.0, 1), (4404128.0, 1), (4307425.0, 1), (4188310.0, 1),
(2460042.0, 1), (4387062.0, 1), (2162785.0, 1), (2168945.0, 1),
(2304868.0, 1), (2437261.0, 1), (3557410.0, 1), (3830618.0, 1),
(3550021.0, 1), (3588758.0, 1), (3447567.0, 1), (3559924.0, 1),
(3284499.0, 1), (3595260.0, 1), (4494963.0, 1), (4294039.0, 1),
(3849395.0, 1), (3620279.0, 1), (3406951.0, 1), (3578885.0, 1),
(3763810.0, 1), (3820821.0, 1)]
# Class B samples: each tuple is (feature value, class label); every label here is 0.
B=[(7179088.0, 0), (7144249.0, 0), (6806806.0, 0), (5080876.0, 0),
(5170390.0, 0), (5694876.0, 0), (6210510.0, 0), (5376014.0, 0),
(6472171.0, 0), (7112956.0, 0), (7356507.0, 0), (7418046.0, 0),
(7975884.0, 0), (7862043.0, 0), (7627016.0, 0), (7778397.0, 0),
(7175463.0, 0), (7347721.0, 0), (5646602.0, 0), (5357049.0, 0),
(6435755.0, 0), (7254820.0, 0), (7509701.0, 0), (7588029.0, 0),
(7491507.0, 0), (7505240.0, 0), (7650181.0, 0), (7574974.0, 0),
(7579726.0, 0), (7444229.0, 0), (3777032.0, 0), (7379626.0, 0),
(7184128.0, 0), (7320911.0, 0), (7425228.0, 0), (7489048.0, 0),
(7145778.0, 0), (7754034.0, 0), (8635490.0, 0), (8798277.0, 0),
(8067185.0, 0), (8205319.0, 0), (8908959.0, 0), (9153481.0, 0),
(9180030.0, 0), (9183460.0, 0), (9212517.0, 0), (9055663.0, 0),
(9053709.0, 0), (9103067.0, 0), (8889903.0, 0), (8328604.0, 0),
(8475442.0, 0), (8499221.0, 0), (8752169.0, 0), (8779133.0, 0),
(8756789.0, 0), (8990732.0, 0), (9027381.0, 0), (9090035.0, 0),
(9343846.0, 0), (9518609.0, 0), (9435149.0, 0), (9365842.0, 0),
(9395256.0, 0), (4381880.0, 0), (4749338.0, 0), (5296143.0, 0),
(5478942.0, 0), (5610865.0, 0), (5514997.0, 0), (5381010.0, 0),
(5090416.0, 0), (4663958.0, 0), (4804526.0, 0), (4743107.0, 0),
(4898914.0, 0), (5018503.0, 0), (5778240.0, 0), (5741893.0, 0),
(4632926.0, 0), (5208486.0, 0), (5633403.0, 0), (5699410.0, 0),
(5748260.0, 0), (5869260.0, 0), (5589575.0, 0), (5627535.0, 0),
(5551501.0, 0), (5467609.0, 0), (5513782.0, 0), (5491950.0, 0),
(5887072.0, 0), (6419620.0, 0), (6625864.0, 0), (6645778.0, 0),
(6580741.0, 0), (6152337.0, 0), (5991092.0, 0), (5847561.0, 0),
(5718127.0, 0), (5971544.0, 0), (6031962.0, 0), (5873358.0, 0),
(6135263.0, 0), (2886886.0, 0), (3855637.0, 0), (7817578.0, 0),
(3747685.0, 0), (7886519.0, 0), (8277473.0, 0), (8284216.0, 0),
(8284850.0, 0), (7753420.0, 0), (7825824.0, 0), (3808486.0, 0),
(3809493.0, 0), (3808122.0, 0), (3637373.0, 0), (3556258.0, 0),
(3487921.0, 0), (3475961.0, 0), (3468375.0, 0), (3410898.0, 0),
(3965656.0, 0), (4175368.0, 0), (4602949.0, 0), (4718392.0, 0),
(4876949.0, 0), (5129132.0, 0), (5110047.0, 0), (5099632.0, 0),
(4935172.0, 0), (4303854.0, 0)]
# Merge both classes into one dataset of (feature value, label) tuples.
data = [*A, *B]

# scikit-learn expects X shaped (n_samples, n_features): one row per sample,
# so the single feature is wrapped in a one-element list.
x = [[sample[0]] for sample in data]
# Labels must be 1-D (n_samples,). A list of one-element lists is a column
# vector, which makes fit() emit a DataConversionWarning.
y = [sample[1] for sample in data]
print(len(x), len(y))

# Hold out 20% of the samples for evaluation; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)
print(len(x_train), len(x_test))
print(len(y_train), len(y_test))

from sklearn.ensemble import RandomForestClassifier

# Baseline model trained on the single feature.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(x_train, y_train)
# Print the score explicitly: a bare expression is discarded when this runs
# as a script rather than as the last line of a notebook cell.
print("Test accuracy (one feature):", clf.score(x_test, y_test))
# Flatten feature 1 from a list of one-element rows into a 1-D float array
# (ravel also accepts input that is already flat).
x = np.asarray(x).ravel()
# Labels as a 1-D array, whether y arrived as [[1], [0], ...] or [1, 0, ...].
y = np.asarray(y).ravel()

# A pandas DataFrame makes adding features easy: each feature is one column.
df = pd.DataFrame({'f1': x, 'y': y})
print(df.head())

# Adding a new feature.
# NOTE: f1 * 3 is only a placeholder showing the mechanics; a scaled copy of
# f1 adds no new information. Replace it with a second measurement taken from
# each image. If the raw data carries it, A and B become
# [(f1, f2, label), ...] and the feature rows are
# [list(sample[:-1]) for sample in data].
df['f2'] = df['f1'] * 3
print(df.head())

# Retrain on both features: the classifier itself is unchanged, only X grows
# from one column to two. New names keep the one-feature split and model intact.
feature_columns = ['f1', 'f2']
x2_train, x2_test, y2_train, y2_test = train_test_split(
    df[feature_columns], df['y'], test_size=0.2, random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf2.fit(x2_train, y2_train)
print("Test accuracy (two features):", clf2.score(x2_test, y2_test))