A rookie NLPer

Machine Learning Model, the first step

%matplotlib inline
import numpy as np
random_data = np.random.random((20,2))
random_data
array([[0.50296664, 0.62445093],
       [0.19994622, 0.19373156],
       [0.14250226, 0.61334931],
       [0.70048398, 0.75160392],
       [0.74448897, 0.7320275 ],
       [0.85976709, 0.84319014],
       [0.73253413, 0.12288743],
       [0.88371578, 0.84136921],
       [0.97180754, 0.79078425],
       [0.05776667, 0.92731363],
       [0.03322522, 0.36021126],
       [0.30821425, 0.57943347],
       [0.16970345, 0.72740845],
       [0.5127129 , 0.44245729],
       [0.37546157, 0.477542  ],
       [0.96229987, 0.98167783],
       [0.79974288, 0.20093964],
       [0.65953995, 0.83289056],
       [0.83061176, 0.10022954],
       [0.57372662, 0.74422547]])
import matplotlib.pyplot as plt
X = random_data[:,0]
Y = random_data[:,1]
import random
def assuming_function(x):
    # Such relations are everywhere in daily life:
    #   weight -> probability of hypertension
    #   income -> probability of buying Armani
    # Each is really a latent functional relation plus some random variation.
    return 13.4 * x + 5 + random.randint(-5, 5)
y = [assuming_function(x) for x in X]
plt.scatter(X,y)
<matplotlib.collections.PathCollection at 0x21e39725708>

(figure: scatter plot of X against the noisy y)

y = np.array(y)

Regression -> predicts a real number

Classification -> predicts a categorical label, often one-hot encoded: [0,0,0,1], [0,1,0,0]
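
As a small illustration (a sketch, not a cell from the original notebook), such one-hot labels are easy to build with numpy:

labels = np.array([3, 1, 0])   # three samples, four classes 0..3
one_hot = np.eye(4)[labels]    # row i is the one-hot vector for labels[i]
# -> [[0., 0., 0., 1.],
#     [0., 1., 0., 0.],
#     [1., 0., 0., 0.]]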

y
array([ 9.73975302,  6.67927941,  3.90953023, 10.38648538, 13.97615214,
       20.520879  , 10.81595738, 14.84179151, 19.02222101,  8.77407336,
        0.44521789, 13.13007093,  4.27402617,  7.87035283,  9.03118504,
       13.89481826, 18.71655464, 15.83783527, 16.1301976 , 10.68793665])
import numpy as np
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X.reshape(-1,1),y)
reg.score(X.reshape(-1,1),y)
0.7115853500059341
reg.coef_
array([14.50455278])
reg.intercept_
3.441324165071637
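
The learned slope (≈14.50) and intercept (≈3.44) are reasonably close to the true 13.4 and 5 inside assuming_function; the gap comes from the random noise term. The score reported above (≈0.71) is the R² of the fit.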
def f(x):
    return reg.coef_ * x + reg.intercept_
plt.scatter(X,y)
plt.plot(X,f(X),color='red')
[<matplotlib.lines.Line2D at 0x21e3cf5c8c8>]

(figure: the same scatter with the fitted regression line in red)

X
array([0.50296664, 0.19994622, 0.14250226, 0.70048398, 0.74448897,
       0.85976709, 0.73253413, 0.88371578, 0.97180754, 0.05776667,
       0.03322522, 0.30821425, 0.16970345, 0.5127129 , 0.37546157,
       0.96229987, 0.79974288, 0.65953995, 0.83061176, 0.57372662])

How to implement a KNN model

def model(X, y):
    # "training" a KNN model is just storing X and y
    return [(Xi, yi) for Xi, yi in zip(X, y)]
from scipy.spatial.distance import cosine
def distance(x1, x2):
    # cosine distance; note that for the scalar X used here it is degenerate
    # (an absolute difference would be a more meaningful 1-D metric)
    return cosine(x1, x2)
def predict(x, k=5):
    # prediction is where the heavy computation happens
    most_similars = sorted(model(X, y), key=lambda xi: distance(xi[0], x))[:k]

    # -> regression: numerical -> aggregate most_similars' y values
    # -> classification: categorical -> vote among most_similars' y values

    # We now have the k most similar samples.
    # For a categorical y, Counter(ys).most_common(1) would give the majority label;
    # y here is numerical, so we average (a sketch of the regression case):
    return sum(yi for _, yi in most_similars) / k
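
With the function completed as above, a call such as the following returns the average y of the five "nearest" stored points (the exact value depends on the random data generated earlier, and on the degenerate cosine distance noted above):

predict(0.5)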

Don't just learn an algorithm in isolation; see the way of thinking behind it:

Bayes, linear regression, decision trees, KNN

New problems rarely come with ready-made solutions, but building on these mature ways of thinking, we can invent new methods.

How to implement a Decision Tree

from collections import Counter

Information entropy

Gini impurity
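
For a group whose classes occur with probabilities $p_i$, the standard definitions are:

Entropy: $H = -\sum_i p_i \log p_i$

Gini impurity: $Gini = 1 - \sum_i p_i^2$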

! pip install icecream
Collecting icecream
  Downloading https://files.pythonhosted.org/packages/8c/ec/821ef939e8e4f4306e7263afa7e2ce0b4c5da9e6e53d1cc97b01606035f8/icecream-2.0.0-py2.py3-none-any.whl
Requirement already satisfied: colorama>=0.3.9 in d:\anaconda\lib\site-packages (from icecream) (0.4.1)
Collecting asttokens>=2.0.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/e8/18/41e95b4a6b4fd3ae704e672da5d070272518995f580be79d772be312c4af/asttokens-2.0.3-py2.py3-none-any.whl
Requirement already satisfied: pygments>=2.2.0 in d:\anaconda\lib\site-packages (from icecream) (2.4.2)
Collecting executing>=0.3.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/79/a1/f85482473b12b2b0e1fa10da84d4280930dbd6e4e149cedf7ae91f894138/executing-0.4.1.tar.gz
Requirement already satisfied: six in d:\anaconda\lib\site-packages (from asttokens>=2.0.1->icecream) (1.12.0)
Building wheels for collected packages: executing
  Building wheel for executing (setup.py): started
  Building wheel for executing (setup.py): finished with status 'done'
  Created wheel for executing: filename=executing-0.4.1-cp37-none-any.whl size=8302 sha256=2fce6277eb7197756482de660a24f6d2c80bb965838cbbb515106f385e4ccad3
  Stored in directory: C:\Users\tb\AppData\Local\pip\Cache\wheels\b0\71\dc\c1bdcd4b384c4458b639dfa905bc093979b8779f2e0df78792
Successfully built executing
Installing collected packages: asttokens, executing, icecream
Successfully installed asttokens-2.0.3 executing-0.4.1 icecream-2.0.0
from icecream import ic
def entropy(elements):
    '''How impure / disordered the group is'''
    counter = Counter(elements)
    probs = [counter[c] / len(elements) for c in set(elements)]
    ic(probs)
    return -sum(p * np.log(p) for p in probs)
entropy([1,1,1,1])
ic| probs: [1.0]
-0.0
entropy([1,1,1,0])
ic| probs: [0.25, 0.75]
0.5623351446188083
entropy([2,3,3,3])
ic| probs: [0.25, 0.75]
0.5623351446188083
entropy([2,3,3,4])
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
entropy([1,2,2,3])
ic| probs: [0.25, 0.5, 0.25]
1.0397207708399179
entropy([1,2,3,4])
ic| probs: [0.25, 0.25, 0.25, 0.25]
1.3862943611198906

How does a decision tree decide which feature to split on?

mock_data = {
    'gender':        ['F', 'F', 'F', 'F', 'M', 'M', 'M'],
    'income':        ['+10', '-10', '+10', '+10', '+10', '+10', '-10'],
    'family_number': [1, 1, 2, 1, 1, 1, 2],
    # 'pet':         [1, 1, 1, 0, 0, 0, 1],  # commented out here; some of the runs below were made with it enabled
    'bought':        [1, 1, 1, 0, 0, 0, 1],
}
import pandas as pd
dataset = pd.DataFrame.from_dict(mock_data)
dataset
  gender income  family_number  bought
0      F    +10              1       1
1      F    -10              1       1
2      F    +10              2       1
3      F    +10              1       0
4      M    +10              1       0
5      M    +10              1       0
6      M    -10              2       1

Now suppose a new case arrives: [F, -10, 2, 1] -> ?

Or: [F, +10, 2, 0] -> ?

# split_by_gender:
print(entropy([1, 1, 1, 0]) + entropy([0, 0, 1]))

# split_by_income:
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))

# split_by_family_number:
print(entropy([1, 1, 0, 0, 0]) + entropy([1, 1]))

# split_by_pet:
print(entropy([1, 1, 1, 1]) + entropy([0, 0, 0]))
ic| probs: [0.25, 0.75]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [0.6, 0.4]
ic| probs: [1.0]
ic| probs: [1.0]
ic| probs: [1.0]


1.198849312913621
0.6730116670092565
0.6730116670092565
-0.0

When choosing which decision to make at each step (and in what order), a decision tree picks the feature whose split leaves the resulting subsets with the lowest entropy.

set(mock_data['family_number'])
{1, 2}
set(mock_data['gender'])
{'F', 'M'}
sub_split_1 = dataset[dataset['family_number'] == 1]['bought'].tolist()
sub_split_1
[1, 1, 0, 0, 0]
sub_split_2 = dataset[dataset['family_number'] != 1]['bought'].tolist()
sub_split_2
[1, 1]
splited_data = dataset[dataset['family_number'] == 1]
splited_data
  gender income  family_number  pet  bought
0      F    +10              1    1       1
1      F    -10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0
splited_data[splited_data['income'] == '+10']
  gender income  family_number  pet  bought
0      F    +10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0

===> Based on information entropy, we obtain a decision process:

Step 1: look at the person's family_number.
If the family has 2 members, they will buy; otherwise, move on to the next step.
Step 2: look at the person's income.
If the income is '+10', there is only a 1/4 chance they buy; if it is '-10', they will definitely buy.
entropy(sub_split_1)
ic| probs: [0.6, 0.4]
0.6730116670092565
entropy(sub_split_2)
ic| probs: [1.0]
-0.0
set(dataset.columns.to_list()) - {'bought'}
{'family_number', 'gender', 'income', 'pet'}
def find_the_optimal_spilter(training_data: pd.DataFrame, target: str):
    x_fields = set(training_data.columns.tolist()) - {target}

    spliter = None
    min_entropy = float('inf')

    for f in x_fields:
        ic(f)
        values = set(training_data[f])
        ic(values)
        for v in values:
            sub_spliter_1 = training_data[training_data[f] == v][target].tolist()
            ic(sub_spliter_1)
            # split by the current feature and one of its values

            entropy_1 = entropy(sub_spliter_1)
            ic(entropy_1)

            sub_spliter_2 = training_data[training_data[f] != v][target].tolist()
            ic(sub_spliter_2)

            entropy_2 = entropy(sub_spliter_2)
            ic(entropy_2)

            entropy_v = entropy_1 + entropy_2
            ic(entropy_v)

            if entropy_v <= min_entropy:
                min_entropy = entropy_v
                spliter = (f, v)

    print(' spliter is: {}'.format(spliter))
    print(' the min entropy is: {}'.format(min_entropy))

    return spliter
find_the_optimal_spilter(training_data=dataset,target='bought')
ic| f: 'pet'
ic| values: {0, 1}
ic| sub_spliter_1: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| sub_spliter_1: [1, 1, 1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [0, 0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: -0.0
ic| f: 'family_number'
ic| values: {1, 2}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1, 1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_2: 0.6730116670092565
ic| entropy_v: 0.6730116670092565
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: [1, 1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6730116670092565
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 1.198849312913621
ic| sub_spliter_1: [0, 0, 1]
ic| probs: [0.6666666666666666, 0.3333333333333333]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [1, 1, 1, 0]
ic| probs: [0.25, 0.75]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 1.198849312913621

 spliter is: ('pet', 1)
 the min entropy is: -0.0
('pet', 1)
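
The winner is ('pet', 1) because in this run the pet column is identical to bought ([1,1,1,0,0,0,1]), so splitting on it leaves both subsets perfectly pure (entropy 0).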
dataset[dataset['family_number'] == 2]
  gender income  family_number  pet  bought
2      F    +10              2    1       1
6      M    -10              2    1       1
dataset[dataset['family_number'] == 1]
  gender income  family_number  pet  bought
0      F    +10              1    1       1
1      F    -10              1    1       1
3      F    +10              1    0       0
4      M    +10              1    0       0
5      M    +10              1    0       0
find_the_optimal_spilter(dataset[dataset['family_number'] == 1], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 1, 0, 0, 0]
ic| probs: [0.6, 0.4]
ic| entropy_1: 0.6730116670092565
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.6730116670092565
ic| f: 'income'
ic| values: {'-10', '+10'}
ic| sub_spliter_1: [1]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_2: 0.5623351446188083
ic| entropy_v: 0.5623351446188083
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: [1]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_1: 0.6365141682948128
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6365141682948128
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 1, 0]
ic| probs: [0.3333333333333333, 0.6666666666666666]
ic| entropy_2: 0.6365141682948128
ic| entropy_v: 0.6365141682948128

 spliter is: ('income', '+10')
 the min entropy is: 0.5623351446188083
('income', '+10')
fm_n_1 = dataset[dataset['family_number'] == 1]
fm_n_1[fm_n_1['income'] == '+10']
  gender income  family_number  bought
0      F    +10              1       1
3      F    +10              1       0
4      M    +10              1       0
5      M    +10              1       0
find_the_optimal_spilter(fm_n_1[fm_n_1['income'] == '+10'], 'bought')
ic| f: 'family_number'
ic| values: {1}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'income'
ic| values: {'+10'}
ic| sub_spliter_1: [1, 0, 0, 0]
ic| probs: [0.75, 0.25]
ic| entropy_1: 0.5623351446188083
ic| sub_spliter_2: []
ic| probs: []
ic| entropy_2: 0
ic| entropy_v: 0.5623351446188083
ic| f: 'gender'
ic| values: {'F', 'M'}
ic| sub_spliter_1: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_1: 0.6931471805599453
ic| sub_spliter_2: [0, 0]
ic| probs: [1.0]
ic| entropy_2: -0.0
ic| entropy_v: 0.6931471805599453
ic| sub_spliter_1: [0, 0]
ic| probs: [1.0]
ic| entropy_1: -0.0
ic| sub_spliter_2: [1, 0]
ic| probs: [0.5, 0.5]
ic| entropy_2: 0.6931471805599453
ic| entropy_v: 0.6931471805599453

 spliter is: ('income', '+10')
 the min entropy is: 0.5623351446188083
('income', '+10')
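
So far each node is handled by calling find_the_optimal_spilter by hand. To grow the whole tree automatically, one could wrap it in a small recursion. A minimal sketch, assuming we stop when a subset is pure or cannot be split any further (build_tree is a hypothetical helper, not part of the original notebook):

def build_tree(data, target):
    labels = data[target].tolist()
    if len(set(labels)) == 1:              # pure node: predict its label
        return labels[0]
    f, v = find_the_optimal_spilter(data, target)
    left, right = data[data[f] == v], data[data[f] != v]
    if len(left) == 0 or len(right) == 0:  # no useful split left: majority vote
        return Counter(labels).most_common(1)[0][0]
    return {'{} == {}'.format(f, v): build_tree(left, target),
            '{} != {}'.format(f, v): build_tree(right, target)}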
Evaluation Methods

1. Accuracy

2. Precision

3. Recall

3.5 F1 Score, F2 Score

4. AUC

Task: decide whether an email is spam. If it is, the program outputs 1; otherwise 0.

Given ten data points, the ground-truth labels are:

-> [1,1,1,1,0,1,0,0,1,1]

and the model F(x) predicts:

-> [1,1,1,1,1,1,1,1,0,1]

Accuracy: the number of correct predictions / the total number of predictions

-> 6/10

Precision: "yes" predictions that are correct / all "yes" predictions

-> 6/9

Recall: "yes" predictions that are correct / all samples whose true label is "yes"

-> 6/7

In practice, recall and precision usually trade off against each other.

F1 Score = $\frac{2 \cdot precision \cdot recall}{precision + recall}$
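
These numbers are easy to check with sklearn.metrics (a quick sketch; the comments show the expected values):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = [1, 1, 1, 1, 0, 1, 0, 0, 1, 1]
y_pred = [1, 1, 1, 1, 1, 1, 1, 1, 0, 1]

print(accuracy_score(y_true, y_pred))   # 0.6      (6/10)
print(precision_score(y_true, y_pred))  # 0.666... (6/9)
print(recall_score(y_true, y_pred))     # 0.857... (6/7)
print(f1_score(y_true, y_pred))         # 0.75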

-> ROC / AUC

At this point, no further split is possible.

A simple example of kmeans

from sklearn.cluster import KMeans
X1 = [random.randint(0,100) for _ in range(100)]
X2 = [random.randint(0,100) for _ in range(100)]
plt.scatter(X1,X2)
<matplotlib.collections.PathCollection at 0x21e43481048>

(figure: scatter plot of the random points X1 vs X2)

training_data = [[x1,x2] for x1,x2 in zip(X1,X2)]
cluster = KMeans(n_clusters=6,max_iter=500)
cluster.fit(training_data)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
cluster.cluster_centers_
array([[85.5       , 41.78571429],
       [27.57894737, 18.89473684],
       [79.90909091, 84.22727273],
       [14.6       , 76.6       ],
       [51.06666667, 65.33333333],
       [62.73333333, 22.13333333]])
cluster.labels_
array([1, 3, 2, 5, 0, 1, 5, 0, 1, 2, 4, 3, 3, 4, 4, 0, 1, 0, 5, 0, 1, 4,
       1, 3, 4, 3, 5, 1, 1, 4, 2, 2, 3, 4, 2, 2, 5, 3, 2, 1, 4, 0, 3, 5,
       2, 4, 3, 2, 1, 3, 4, 1, 2, 5, 0, 5, 5, 0, 0, 2, 2, 1, 2, 2, 0, 0,
       4, 1, 5, 3, 5, 0, 2, 5, 3, 3, 5, 5, 5, 2, 1, 2, 2, 4, 1, 2, 4, 4,
       2, 1, 0, 1, 2, 3, 1, 2, 0, 1, 3, 4])
from collections import defaultdict
centers = defaultdict(list)
for label, location in zip(cluster.labels_, training_data):
    centers[label].append(location)
color = ['red', 'green', 'grey', 'black', 'yellow', 'orange']

for i, c in enumerate(centers):
    for location in centers[c]:
        plt.scatter(*location, c=color[i])

for center in cluster.cluster_centers_:
    plt.scatter(*center, s=100)

(figure: points colored by cluster assignment, with the six cluster centers overlaid)

The computational complexity of KMeans

def distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

N=10000, k=100, d=500, I=500 -> N × k × d × I = 10^4 × 10^2 × (5×10^2) × (5×10^2) ≈ 2.5 × 10^11, i.e. on the order of 10^11 distance operations (hundreds of billions).
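
A sketch of where that product comes from (pseudocode-style; I, data and centers here are placeholders, not variables from the notebook):

for iteration in range(I):          # I iterations
    for point in data:              # N points
        for center in centers:      # k centers per point
            # each distance touches all d coordinates -> O(d) work
            d2 = sum((p - c) ** 2 for p, c in zip(point, center))
        # assign the point to its nearest center
    # then recompute the k centers from the assignments
# total: roughly N * k * d * I basic operations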

O(∩_∩)O Haha~