import numpy as np

random_data = np.random.random((20, 2))
array([[0.50296664, 0.62445093],
[0.19994622, 0.19373156],
[0.14250226, 0.61334931],
[0.70048398, 0.75160392],
[0.74448897, 0.7320275 ],
[0.85976709, 0.84319014],
[0.73253413, 0.12288743],
[0.88371578, 0.84136921],
[0.97180754, 0.79078425],
[0.05776667, 0.92731363],
[0.03322522, 0.36021126],
[0.30821425, 0.57943347],
[0.16970345, 0.72740845],
[0.5127129 , 0.44245729],
[0.37546157, 0.477542 ],
[0.96229987, 0.98167783],
[0.79974288, 0.20093964],
[0.65953995, 0.83289056],
[0.83061176, 0.10022954],
[0.57372662, 0.74422547]])
import matplotlib.pyplot as plt
import random

# X (printed further below) is the first column of random_data
X = random_data[:, 0]

def assuming_function(x):
    # a linear generating function with some integer noise
    return 13.4 * x + 5 + random.randint(-5, 5)
y = [assuming_function(x) for x in X]
<matplotlib.collections.PathCollection at 0x21e39725708>
Regression -> predicts a real number; Classification -> predicts a discrete label, often one-hot encoded, e.g. [0,0,0,1] or [0,1,0,0].
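As a quick illustration of that contrast (this snippet is ours, not from the original notebook), a class index can be turned into a one-hot vector:

import numpy as np

classes = np.array([3, 1])        # hypothetical class indices
one_hot = np.eye(4)[classes]      # -> [[0, 0, 0, 1], [0, 1, 0, 0]]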
array([ 9.73975302, 6.67927941, 3.90953023, 10.38648538, 13.97615214,
20.520879 , 10.81595738, 14.84179151, 19.02222101, 8.77407336,
0.44521789, 13.13007093, 4.27402617, 7.87035283, 9.03118504,
13.89481826, 18.71655464, 15.83783527, 16.1301976 , 10.68793665])
import numpy as np
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X.reshape(-1, 1), y)
reg.score(X.reshape(-1, 1), y)
0.7115853500059341
reg.coef_
array([14.50455278])

reg.intercept_
3.441324165071637
def f(x):
    return reg.coef_ * x + reg.intercept_
plt.scatter(X, y)
plt.plot(X, f(X), color='red')
[<matplotlib.lines.Line2D at 0x21e3cf5c8c8>]
array([0.50296664, 0.19994622, 0.14250226, 0.70048398, 0.74448897,
0.85976709, 0.73253413, 0.88371578, 0.97180754, 0.05776667,
0.03322522, 0.30821425, 0.16970345, 0.5127129 , 0.37546157,
0.96229987, 0.79974288, 0.65953995, 0.83061176, 0.57372662])
How to implement a KNN model

def model(X, y):
    return [(Xi, yi) for Xi, yi in zip(X, y)]
from scipy.spatial.distance import cosine
def distance(x1, x2):
    return cosine(x1, x2)
def predict(x, k=5):
    # the k training points most similar to x (smallest cosine distance)
    most_similars = sorted(model(X, y), key=lambda xi: distance(xi[0], x))[:k]
    # minimal completion (the rest of the original cell is assumed):
    # average the k neighbours' targets; for class labels a majority vote works too
    return sum(yi for _, yi in most_similars) / k
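Cosine distance is only meaningful for vectors with more than one component, so here is a self-contained sketch of the same KNN idea on 2-D points; the names (points, labels, knn_predict) and the synthetic labelling rule are our own, not from the notebook:

import numpy as np
from collections import Counter
from scipy.spatial.distance import cosine

points = np.random.random((20, 2))                  # 2-D training points
labels = [int(p[0] + p[1] > 1.0) for p in points]   # a made-up binary label

def knn_predict(query, k=5):
    # sort stored (point, label) pairs by distance to the query and keep the k nearest
    neighbours = sorted(zip(points, labels), key=lambda pl: cosine(pl[0], query))[:k]
    # majority vote among the k nearest neighbours
    return Counter(l for _, l in neighbours).most_common(1)[0][0]

knn_predict(np.array([0.3, 0.8]))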
Don't just learn each algorithm in isolation; pay attention to the way of thinking behind it:
Bayes, linear regression, decision trees, KNN.
New problems rarely have ready-made solutions, but by drawing on these well-established ways of thinking we can invent new methods.

How to implement a Decision Tree

from collections import Counter
Information entropy / Gini impurity
Collecting icecream
Successfully installed asttokens-2.0.3 executing-0.4.1 icecream-2.0.0
from icecream import ic

def entropy(elements):
    '''How disordered a group of labels is.'''
    counter = Counter(elements)
    probs = [counter[c] / len(elements) for c in set(elements)]
    ic(probs)
    return -sum(p * np.log(p) for p in probs)
ic| probs: [1.0]
-0.0
ic| probs: [0.25, 0.75]
0.5623351446188083
ic| probs: [0.25, 0.75]
0.5623351446188083
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
ic| probs: [0.25, 0.25, 0.25, 0.25]
1.3862943611198906
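Gini impurity, mentioned above, can be computed in the same style; the gini helper below is our own sketch, not part of the original notebook:

def gini(elements):
    # Gini impurity: 1 minus the sum of squared class probabilities
    counter = Counter(elements)
    probs = [counter[c] / len(elements) for c in set(elements)]
    return 1 - sum(p ** 2 for p in probs)

gini([1, 1, 0, 0, 0])   # 0.48; like entropy, 0 means a perfectly pure group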
How does a decision tree decide which feature to split on?

mock_data = {
    'gender':        ['F', 'F', 'F', 'F', 'M', 'M', 'M'],
    'income':        ['+10', '-10', '+10', '+10', '+10', '+10', '-10'],
    'family_number': [1, 1, 2, 1, 1, 1, 2],
    'bought':        [1, 1, 1, 0, 0, 0, 1],
}
import pandas as pd

dataset = pd.DataFrame.from_dict(mock_data)
   gender income  family_number  bought
0       F    +10              1       1
1       F    -10              1       1
2       F    +10              2       1
3       F    +10              1       0
4       M    +10              1       0
5       M    +10              1       0
6       M    -10              2       1
If a new case comes in: [F, -10, 2, 1] -> ?
Or: [F, +10, 2, 0] -> ?
# total entropy after splitting on each candidate feature (the two groups' entropies are summed)
print(entropy([1, 1, 1, 0]) + entropy([0, 0, 1]))       # split by gender
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))       # split by family_number
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))       # split by income
print(entropy([1, 1, 1, 1]) + entropy([0, 0, 0]))       # a perfect split: both groups are pure
ic| probs: [0.25, 0.75]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [1.0]
ic| probs: [1.0]
1.198849312913621
0.6730116670092565
0.6730116670092565
-0.0
When a decision tree chooses which feature to split on (and in what order), it follows the principle of minimising the entropy of the data after the split.

set(mock_data['family_number'])
{1, 2}
set(mock_data['gender'])
{'F', 'M'}
sub_split_1 = dataset[dataset['family_number'] == 1]['bought'].tolist()
[1, 1, 0, 0, 0]
sub_split_2 = dataset[dataset['family_number'] != 1]['bought'].tolist()
[1, 1]
splited_data = dataset[dataset['family_number'] == 1]
(By this point a 'pet' column has also been added to the dataset; it matches 'bought' exactly.)

   gender income  family_number  pet  bought
0       F    +10              1    1       1
1       F    -10              1    1       1
3       F    +10              1    0       0
4       M    +10              1    0       0
5       M    +10              1    0       0
splited_data[splited_data['income'] == '+10']
   gender income  family_number  pet  bought
0       F    +10              1    1       1
3       F    +10              1    0       0
4       M    +10              1    0       0
5       M    +10              1    0       0
===> Based on information entropy, we arrive at a decision procedure:
Step 1: look at the family size. If the family has 2 members, the person buys; otherwise, continue to the next step.
Step 2: look at the income. If the income is '-10', the person buys; if it is '+10', there is only a 1/4 probability of buying.
ic| probs: [0.6, 0.4]
0.6730116670092565
ic| probs: [1.0]
-0.0
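As a small illustration, the two-step procedure above can be written down directly as a function; predict_bought is our own sketch, not code from the notebook:

def predict_bought(gender, income, family_number):
    # Step 1: in the training data, every 2-member family bought
    if family_number == 2:
        return 1
    # Step 2: among 1-member families, income '-10' always bought,
    # while income '+10' bought only 1 time out of 4, so predict "no"
    return 1 if income == '-10' else 0

predict_bought('F', '-10', 2)   # -> 1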
set(dataset.columns.to_list()) - {'bought'}
{'family_number', 'gender', 'income', 'pet'}
def find_the_optimal_spilter(training_data: pd.DataFrame, target: str) -> str:
    x_fields = set(training_data.columns.tolist()) - {target}

    spliter = None
    min_entropy = float('inf')

    for f in x_fields:
        ic(f)
        values = set(training_data[f])
        ic(values)
        for v in values:
            # entropy of the target within the two groups produced by splitting on f == v
            sub_spliter_1 = training_data[training_data[f] == v][target].tolist()
            ic(sub_spliter_1)
            entropy_1 = entropy(sub_spliter_1)
            ic(entropy_1)

            sub_spliter_2 = training_data[training_data[f] != v][target].tolist()
            ic(sub_spliter_2)
            entropy_2 = entropy(sub_spliter_2)
            ic(entropy_2)

            # note: the two entropies are simply added, without weighting by group size
            entropy_v = entropy_1 + entropy_2
            ic(entropy_v)

            if entropy_v <= min_entropy:
                min_entropy = entropy_v
                spliter = (f, v)

    print('spliter is: {}'.format(spliter))
    print('the min entropy is: {}'.format(min_entropy))

    return spliter
find_the_optimal_spilter(training_data=dataset, target='bought')
ic| f: 'pet'
ic| values: {0, 1}
ic| sub_spliter_1: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| sub_spliter_1: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| f: 'family_number'
ic| values: {1, 2}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 1.198849312913621
ic| sub_spliter_1: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 1.198849312913621
spliter is: ('pet', 1)
the min entropy is: -0.0
('pet', 1)
dataset[dataset['family_number'] == 2]
   gender income  family_number  pet  bought
2       F    +10              2    1       1
6       M    -10              2    1       1
dataset[dataset['family_number'] == 1]
   gender income  family_number  pet  bought
0       F    +10              1    1       1
1       F    -10              1    1       1
3       F    +10              1    0       0
4       M    +10              1    0       0
5       M    +10              1    0       0
find_the_optimal_spilter(dataset[dataset['family_number'] == 1], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 0.5623351446188083
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6365141682948128
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 0.6365141682948128
spliter is: ('income', '+10')
the min entropy is: 0.5623351446188083
('income', '+10')
fm_n_1 = dataset[dataset['family_number'] == 1]
fm_n_1[fm_n_1['income'] == '+10']
   gender income  family_number  bought
0       F    +10              1       1
3       F    +10              1       0
4       M    +10              1       0
5       M    +10              1       0
find_the_optimal_spilter(fm_n_1[fm_n_1['income'] == '+10'], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'income'
ic| values: {'+10'}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_1: 0.6931471805599453
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6931471805599453
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_2: 0.6931471805599453
ic| entropy_v: 0.6931471805599453
spliter is: ('income', '+10')
the min entropy is: 0.5623351446188083
('income', '+10')
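A hedged sketch of wrapping find_the_optimal_spilter into a recursive tree builder; the build_tree helper and its nested-dict tree representation are our own illustration, not part of the original notebook:

def build_tree(data, target):
    labels = data[target].tolist()
    if len(labels) == 0:
        return None                                    # nothing left to decide on
    if len(set(labels)) == 1 or len(data.columns) == 1:
        # pure node, or no features left: predict the majority label
        return Counter(labels).most_common(1)[0][0]
    f, v = find_the_optimal_spilter(data, target)
    # recurse on the two groups, dropping the feature that was just used
    left = data[data[f] == v].drop(columns=[f])
    right = data[data[f] != v].drop(columns=[f])
    return {(f, v): {'==': build_tree(left, target),
                     '!=': build_tree(right, target)}}

build_tree(dataset, 'bought')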
At this point the data can no longer be usefully split, so the decision procedure stops here.

Evaluation Methods
1. Accuracy
2. Precision
3. Recall
3.5 F1 Score, F2 Score
4. AUC

Example: deciding whether an email is spam; the program outputs 1 for spam and 0 otherwise. For ten samples, the true labels and the predictions are:
y    -> [1,1,1,1,0,1,0,0,1,1]
F(x) -> [1,1,1,1,1,1,1,1,0,1]
Accuracy: the number of correct predictions / the total number of predictions -> 6/10
Precision: the number of 'yes' predictions that are correct / the total number of 'yes' predictions -> 6/9
Recall: the number of 'yes' predictions that are correct / the number of samples whose true label is 'yes' -> 6/7
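A short sketch verifying these three numbers in plain Python; y_true and y_pred are just the two lists above, and the variable names are ours:

y_true = [1, 1, 1, 1, 0, 1, 0, 0, 1, 1]
y_pred = [1, 1, 1, 1, 1, 1, 1, 1, 0, 1]

tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)   # true positives
correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)

accuracy = correct / len(y_true)    # 6/10 = 0.6
precision = tp / sum(y_pred)        # 6/9  ≈ 0.667
recall = tp / sum(y_true)           # 6/7  ≈ 0.857
print(accuracy, precision, recall)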
Recall and precision usually trade off against each other. F1 Score = $\frac{2 \cdot precision \cdot recall}{precision + recall}$ -> ROC / AUC

A simple example of kmeans

from sklearn.cluster import KMeans
X1 = [random.randint(0, 100) for _ in range(100)]
X2 = [random.randint(0, 100) for _ in range(100)]
<matplotlib.collections.PathCollection at 0x21e43481048>
training_data = [[x1, x2] for x1, x2 in zip(X1, X2)]
cluster = KMeans(n_clusters=6, max_iter=500)
cluster.fit(training_data)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
cluster.cluster_centers_
array([[85.5 , 41.78571429],
[27.57894737, 18.89473684],
[79.90909091, 84.22727273],
[14.6 , 76.6 ],
[51.06666667, 65.33333333],
[62.73333333, 22.13333333]])
array([1, 3, 2, 5, 0, 1, 5, 0, 1, 2, 4, 3, 3, 4, 4, 0, 1, 0, 5, 0, 1, 4,
1, 3, 4, 3, 5, 1, 1, 4, 2, 2, 3, 4, 2, 2, 5, 3, 2, 1, 4, 0, 3, 5,
2, 4, 3, 2, 1, 3, 4, 1, 2, 5, 0, 5, 5, 0, 0, 2, 2, 1, 2, 2, 0, 0,
4, 1, 5, 3, 5, 0, 2, 5, 3, 3, 5, 5, 5, 2, 1, 2, 2, 4, 1, 2, 4, 4,
2, 1, 0, 1, 2, 3, 1, 2, 0, 1, 3, 4])
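The label array above is cluster.labels_, one cluster index per training point. As a usage note, new points can be assigned to the nearest learned centre with KMeans.predict; the sample coordinates below are arbitrary:

cluster.predict([[10, 90], [55, 60]])   # index of the closest cluster centre for each point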
from collections import defaultdict
centers = defaultdict(list)
for label, location in zip(cluster.labels_, training_data):
    centers[label].append(location)
color = ['red', 'green', 'grey', 'black', 'yellow', 'orange']

for i, c in enumerate(centers):
    for location in centers[c]:
        plt.scatter(*location, c=color[i])

for center in cluster.cluster_centers_:
    plt.scatter(*center, s=100)
The computational complexity of KMeans

def distance(x1, y1, x2, y2):
    # Euclidean distance between two 2-D points
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
With N = 10,000 points, k = 100 clusters, d = 500 dimensions and I = 500 iterations, the cost is roughly N·k·d·I = 10^4 · 10^2 · (5·10^2) · (5·10^2) ≈ 2.5 × 10^11 operations, i.e. on the order of 10^11 (hundreds of billions).
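A one-line check of that arithmetic, assuming the simple cost model of one distance-coordinate operation per point, per centre, per dimension, per iteration:

N, k, d, I = 10_000, 100, 500, 500
print(N * k * d * I)   # 250_000_000_000, i.e. 2.5e11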