A rookie NLPer

Machine Learning Model, the first step

%matplotlib inline
import numpy as np
random_data = np.random.random((20,2))
random_data
array([[0.50296664, 0.62445093],
       [0.19994622, 0.19373156],
       [0.14250226, 0.61334931],
       [0.70048398, 0.75160392],
       [0.74448897, 0.7320275 ],
       [0.85976709, 0.84319014],
       [0.73253413, 0.12288743],
       [0.88371578, 0.84136921],
       [0.97180754, 0.79078425],
       [0.05776667, 0.92731363],
       [0.03322522, 0.36021126],
       [0.30821425, 0.57943347],
       [0.16970345, 0.72740845],
       [0.5127129 , 0.44245729],
       [0.37546157, 0.477542  ],
       [0.96229987, 0.98167783],
       [0.79974288, 0.20093964],
       [0.65953995, 0.83289056],
       [0.83061176, 0.10022954],
       [0.57372662, 0.74422547]])
import matplotlib.pyplot as plt
X = random_data[:,0]
Y = random_data[:,1]
import random
def assuming_function(x):
    # Such relations are everywhere in daily life:
    #   weight -> probability of hypertension
    #   income -> probability of buying Armani
    # Each is really a latent functional relation plus some random variation.
    return 13.4 * x + 5 + random.randint(-5, 5)
y = [assuming_function(x) for x in X]
plt.scatter(X,y)
<matplotlib.collections.PathCollection at 0x21e39725708>

(figure: scatter plot of X against the noisy y)

y = np.array(y)

Regression -> predicts a real number

Classification -> predicts a categorical label, often one-hot encoded: [0,0,0,1], [0,1,0,0]
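
As a small illustration (a sketch, not a cell from the original notebook), such one-hot labels are easy to build with numpy:

labels = np.array([3, 1, 0])   # three samples, four classes 0..3
one_hot = np.eye(4)[labels]    # row i is the one-hot vector for labels[i]
# -> [[0., 0., 0., 1.],
#     [0., 1., 0., 0.],
#     [1., 0., 0., 0.]]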

y
array([ 9.73975302,  6.67927941,  3.90953023, 10.38648538, 13.97615214,
       20.520879  , 10.81595738, 14.84179151, 19.02222101,  8.77407336,
        0.44521789, 13.13007093,  4.27402617,  7.87035283,  9.03118504,
       13.89481826, 18.71655464, 15.83783527, 16.1301976 , 10.68793665])
import numpy as np
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X.reshape(-1,1),y)
reg.score(X.reshape(-1,1),y)
0.7115853500059341
reg.coef_
array([14.50455278])
reg.intercept_
3.441324165071637
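
The learned slope (≈14.50) and intercept (≈3.44) are reasonably close to the true 13.4 and 5 inside assuming_function; the gap comes from the random noise term. The score reported above (≈0.71) is the R² of the fit.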
def f(x):
    return reg.coef_ * x + reg.intercept_
plt.scatter(X,y)
plt.plot(X,f(X),color='red')
[<matplotlib.lines.Line2D at 0x21e3cf5c8c8>]

(figure: the same scatter with the fitted regression line in red)

X
array([0.50296664, 0.19994622, 0.14250226, 0.70048398, 0.74448897,
       0.85976709, 0.73253413, 0.88371578, 0.97180754, 0.05776667,
       0.03322522, 0.30821425, 0.16970345, 0.5127129 , 0.37546157,
       0.96229987, 0.79974288, 0.65953995, 0.83061176, 0.57372662])

How to implement a KNN model

def model(X, y):
    # "training" a KNN model is just storing X and y
    return [(Xi, yi) for Xi, yi in zip(X, y)]
from scipy.spatial.distance import cosine
def distance(x1, x2):
    # cosine distance; note that for the scalar X used here it is degenerate
    # (an absolute difference would be a more meaningful 1-D metric)
    return cosine(x1, x2)
def predict(x, k=5):
    # prediction is where the heavy computation happens
    most_similars = sorted(model(X, y), key=lambda xi: distance(xi[0], x))[:k]

    # -> regression: numerical -> aggregate most_similars' y values
    # -> classification: categorical -> vote among most_similars' y values

    # We now have the k most similar samples.
    # For a categorical y, Counter(ys).most_common(1) would give the majority label;
    # y here is numerical, so we average (a sketch of the regression case):
    return sum(yi for _, yi in most_similars) / k
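
With the function completed as above, a call such as the following returns the average y of the five "nearest" stored points (the exact value depends on the random data generated earlier, and on the degenerate cosine distance noted above):

predict(0.5)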

Don't just learn an algorithm in isolation; see the way of thinking behind it:

Bayes, linear regression, decision trees, KNN

New problems rarely come with ready-made solutions, but building on these mature ways of thinking, we can invent new methods.

How to implement a Decision Tree

from collections import Counter

Information entropy

Gini impurity
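
For a group whose classes occur with probabilities $p_i$, the standard definitions are:

Entropy: $H = -\sum_i p_i \log p_i$

Gini impurity: $Gini = 1 - \sum_i p_i^2$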

! pip install icecream
Collecting icecream
  Downloading https://files.pythonhosted.org/packages/8c/ec/821ef939e8e4f4306e7263afa7e2ce0b4c5da9e6e53d1cc97b01606035f8/icecream-2.0.0-py2.py3-none-any.whl
Requirement already satisfied: colorama>=0.3.9 in d:\anaconda\lib\site-packages (from icecream) (0.4.1)
Collecting asttokens>=2.0.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/e8/18/41e95b4a6b4fd3ae704e672da5d070272518995f580be79d772be312c4af/asttokens-2.0.3-py2.py3-none-any.whl
Requirement already satisfied: pygments>=2.2.0 in d:\anaconda\lib\site-packages (from icecream) (2.4.2)
Collecting executing>=0.3.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/79/a1/f85482473b12b2b0e1fa10da84d4280930dbd6e4e149cedf7ae91f894138/executing-0.4.1.tar.gz
Requirement already satisfied: six in d:\anaconda\lib\site-packages (from asttokens>=2.0.1->icecream) (1.12.0)
Building wheels for collected packages: executing
  Building wheel for executing (setup.py): started
  Building wheel for executing (setup.py): finished with status 'done'
  Created wheel for executing: filename=executing-0.4.1-cp37-none-any.whl size=8302 sha256=2fce6277eb7197756482de660a24f6d2c80bb965838cbbb515106f385e4ccad3
  Stored in directory: C:\Users\tb\AppData\Local\pip\Cache\wheels\b0\71\dc\c1bdcd4b384c4458b639dfa905bc093979b8779f2e0df78792
Successfully built executing
Installing collected packages: asttokens, executing, icecream
Successfully installed asttokens-2.0.3 executing-0.4.1 icecream-2.0.0
from icecream import ic
def entropy(elements):
    '''How impure / disordered the group is'''
    counter = Counter(elements)
    probs = [counter[c] / len(elements) for c in set(elements)]
    ic(probs)
    return -sum(p * np.log(p) for p in probs)
entropy([1,1,1,1])
ic| probs: [1.0]
-0.0
entropy([1,1,1,0])
ic| probs: [0.25, 0.75]
0.5623351446188083
entropy([2,3,3,3])
ic| probs: [0.25, 0.75]
0.5623351446188083
entropy([2,3,3,4])
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
entropy([1,2,2,3])
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
entropy([1,2,3,4])
ic| probs: [0.25, 0.25, 0.25, 0.25]
1.3862943611198906

How does a decision tree decide which feature to split on?

mock_data = {
    'gender':        ['F', 'F', 'F', 'F', 'M', 'M', 'M'],
    'income':        ['+10', '-10', '+10', '+10', '+10', '+10', '-10'],
    'family_number': [1, 1, 2, 1, 1, 1, 2],
    # 'pet':         [1, 1, 1, 0, 0, 0, 1],  # commented out here; some of the runs below were made with it enabled
    'bought':        [1, 1, 1, 0, 0, 0, 1],
}
import pandas as pd
dataset = pd.DataFrame.from_dict(mock_data)
dataset
  gender income  family_number  bought
0      F    +10              1       1
1      F    -10              1       1
2      F    +10              2       1
3      F    +10              1       0
4      M    +10              1       0
5      M    +10              1       0
6      M    -10              2       1

Now suppose a new case arrives: [F, -10, 2, 1] -> ?

Or: [F, +10, 2, 0] -> ?

# split_by_gender:
print(entropy([1, 1, 1, 0]) + entropy([0, 0, 1]))

# split_by_income:
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))

# split_by_family_number:
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))

# split_by_pet:
print(entropy([1, 1, 1, 1]) + entropy([0, 0, 0]))
ic| probs: [0.25, 0.75]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [1.0]
ic| probs: [1.0]


1.198849312913621
0.6730116670092565
0.6730116670092565
-0.0

When choosing which decision to make at each step (and in what order), a decision tree picks the feature whose split leaves the resulting subsets with the lowest entropy.

set(mock_data['family_number'])
{1, 2}
set(mock_data['gender'])
{'F', 'M'}
sub_split_1 = dataset[dataset['family_number'] == 1]['bought'].tolist()
sub_split_1
[1, 1, 0, 0, 0]
sub_split_2 = dataset[dataset['family_number'] != 1]['bought'].tolist()
sub_split_2
[1, 1]
splited_data = dataset[dataset['family_number'] == 1]
splited_data
  gender income  family_number  pet  bought
0      F    +10              1    1       1
1      F    -10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0
splited_data[splited_data['income'] == '+10']
  gender income  family_number  pet  bought
0      F    +10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0

===> Based on information entropy, we obtain a decision process:

Step 1: look at the person's family_number.
If the family has 2 members, they will buy; otherwise, move on to the next step.
Step 2: look at the person's income.
If the income is '+10', there is only a 1/4 chance they buy; if it is '-10', they will definitely buy.
entropy(sub_split_1)
ic| probs: [0.6, 0.4]
0.6730116670092565
entropy(sub_split_2)
ic| probs: [1.0]
-0.0
set(dataset.columns.to_list()) - {'bought'}
{'family_number', 'gender', 'income', 'pet'}
def find_the_optimal_spilter(training_data: pd.DataFrame, target: str):
    x_fields = set(training_data.columns.tolist()) - {target}

    spliter = None
    min_entropy = float('inf')

    for f in x_fields:
        ic(f)
        values = set(training_data[f])
        ic(values)
        for v in values:
            sub_spliter_1 = training_data[training_data[f] == v][target].tolist()
            ic(sub_spliter_1)
            # split by the current feature and one of its values

            entropy_1 = entropy(sub_spliter_1)
            ic(entropy_1)

            sub_spliter_2 = training_data[training_data[f] != v][target].tolist()
            ic(sub_spliter_2)

            entropy_2 = entropy(sub_spliter_2)
            ic(entropy_2)

            entropy_v = entropy_1 + entropy_2
            ic(entropy_v)

            if entropy_v <= min_entropy:
                min_entropy = entropy_v
                spliter = (f, v)

    print(' spliter is: {}'.format(spliter))
    print(' the min entropy is: {}'.format(min_entropy))

    return spliter
find_the_optimal_spilter(training_data=dataset,target='bought')
ic| f: 'pet'
ic| values: {0, 1}
ic| sub_spliter_1: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| sub_spliter_1: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| f: 'family_number'
ic| values: {1, 2}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 1.198849312913621
ic| sub_spliter_1: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 1.198849312913621

 spliter is: ('pet', 1)
 the min entropy is: -0.0
('pet', 1)
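
The winner is ('pet', 1) because in this run the pet column is identical to bought ([1,1,1,0,0,0,1]), so splitting on it leaves both subsets perfectly pure (entropy 0).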
dataset[dataset['family_number'] == 2]
  gender income  family_number  pet  bought
2      F    +10              2    1       1
6      M    -10              2    1       1
dataset[dataset['family_number'] == 1]
  gender income  family_number  pet  bought
0      F    +10              1    1       1
1      F    -10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0
find_the_optimal_spilter(dataset[dataset['family_number'] == 1], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 0.5623351446188083
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6365141682948128
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 0.6365141682948128

 spliter is: ('income', '+10')
 the min entropy is: 0.5623351446188083
('income', '+10')
fm_n_1 = dataset[dataset['family_number'] == 1]
fm_n_1[fm_n_1['income'] == '+10']
  gender income  family_number  bought
0      F    +10              1       1
3      F    +10              1       0
4      M    +10              1       0
5      M    +10              1       0
find_the_optimal_spilter(fm_n_1[fm_n_1['income'] == '+10'], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'income'
ic| values: {'+10'}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_1: 0.6931471805599453
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6931471805599453
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_2: 0.6931471805599453
ic| entropy_v: 0.6931471805599453

 spliter is: ('income', '+10')
 the min entropy is: 0.5623351446188083
('income', '+10')
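
So far each node is handled by calling find_the_optimal_spilter by hand. To grow the whole tree automatically, one could wrap it in a small recursion. A minimal sketch, assuming we stop when a subset is pure or cannot be split any further (build_tree is a hypothetical helper, not part of the original notebook):

def build_tree(data, target):
    labels = data[target].tolist()
    if len(set(labels)) == 1:              # pure node: predict its label
        return labels[0]
    f, v = find_the_optimal_spilter(data, target)
    left, right = data[data[f] == v], data[data[f] != v]
    if len(left) == 0 or len(right) == 0:  # no useful split left: majority vote
        return Counter(labels).most_common(1)[0][0]
    return {'{} == {}'.format(f, v): build_tree(left, target),
            '{} != {}'.format(f, v): build_tree(right, target)}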
Evaluation Methods

1. Accuracy

2. Precision

3. Recall

3.5 F1 Score, F2 Score

4. AUC

Task: decide whether an email is spam. If it is, the program outputs 1; otherwise 0.

Given ten data points, the ground-truth labels are:

-> [1,1,1,1,0,1,0,0,1,1]

and the model F(x) predicts:

-> [1,1,1,1,1,1,1,1,0,1]

Accuracy: the number of correct predictions / the total number of predictions

-> 6/10

Precision: "yes" predictions that are correct / all "yes" predictions

-> 6/9

Recall: "yes" predictions that are correct / all samples whose true label is "yes"

-> 6/7

In practice, recall and precision usually trade off against each other.

F1 Score = $\frac{2 \cdot precision \cdot recall}{precision + recall}$
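
These numbers are easy to check with sklearn.metrics (a quick sketch; the comments show the expected values):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = [1, 1, 1, 1, 0, 1, 0, 0, 1, 1]
y_pred = [1, 1, 1, 1, 1, 1, 1, 1, 0, 1]

print(accuracy_score(y_true, y_pred))   # 0.6      (6/10)
print(precision_score(y_true, y_pred))  # 0.666... (6/9)
print(recall_score(y_true, y_pred))     # 0.857... (6/7)
print(f1_score(y_true, y_pred))         # 0.75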

-> ROC / AUC

At this point, no further split is possible.

A simple example of kmeans

from sklearn.cluster import KMeans
X1 = [random.randint(0,100) for _ in range(100)]
X2 = [random.randint(0,100) for _ in range(100)]
plt.scatter(X1,X2)
<matplotlib.collections.PathCollection at 0x21e43481048>

(figure: scatter plot of the random points X1 vs X2)

training_data = [[x1,x2] for x1,x2 in zip(X1,X2)]
cluster = KMeans(n_clusters=6,max_iter=500)
cluster.fit(training_data)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
cluster.cluster_centers_
array([[85.5       , 41.78571429],
       [27.57894737, 18.89473684],
       [79.90909091, 84.22727273],
       [14.6       , 76.6       ],
       [51.06666667, 65.33333333],
       [62.73333333, 22.13333333]])
cluster.labels_
array([1, 3, 2, 5, 0, 1, 5, 0, 1, 2, 4, 3, 3, 4, 4, 0, 1, 0, 5, 0, 1, 4,
       1, 3, 4, 3, 5, 1, 1, 4, 2, 2, 3, 4, 2, 2, 5, 3, 2, 1, 4, 0, 3, 5,
       2, 4, 3, 2, 1, 3, 4, 1, 2, 5, 0, 5, 5, 0, 0, 2, 2, 1, 2, 2, 0, 0,
       4, 1, 5, 3, 5, 0, 2, 5, 3, 3, 5, 5, 5, 2, 1, 2, 2, 4, 1, 2, 4, 4,
       2, 1, 0, 1, 2, 3, 1, 2, 0, 1, 3, 4])
from collections import defaultdict
centers = defaultdict(list)
for label, location in zip(cluster.labels_, training_data):
    centers[label].append(location)
color = ['red', 'green', 'grey', 'black', 'yellow', 'orange']

for i, c in enumerate(centers):
    for location in centers[c]:
        plt.scatter(*location, c=color[i])

for center in cluster.cluster_centers_:
    plt.scatter(*center, s=100)

(figure: points colored by cluster assignment, with the six cluster centers overlaid)

The computational complexity of KMeans

def distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

N=10000, k=100, d=500, I=500 -> N × k × d × I = 10^4 × 10^2 × (5×10^2) × (5×10^2) ≈ 2.5 × 10^11, i.e. on the order of 10^11 distance operations (hundreds of billions).
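
A sketch of where that product comes from (pseudocode-style; I, data and centers here are placeholders, not variables from the notebook):

for iteration in range(I):          # I iterations
    for point in data:              # N points
        for center in centers:      # k centers per point
            # each distance touches all d coordinates -> O(d) work
            d2 = sum((p - c) ** 2 for p, c in zip(point, center))
        # assign the point to its nearest center
    # then recompute the k centers from the assignments
# total: roughly N * k * d * I basic operations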

O(∩_∩)O Haha~