一枚NLPer小菜鸡

AI for NLP

Build Graph

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
coordination_source = """
{name:'兰州', geoCoord:[103.73, 36.03]},
{name:'嘉峪关', geoCoord:[98.17, 39.47]},
{name:'西宁', geoCoord:[101.74, 36.56]},
{name:'成都', geoCoord:[104.06, 30.67]},
{name:'石家庄', geoCoord:[114.48, 38.03]},
{name:'拉萨', geoCoord:[102.73, 25.04]},
{name:'贵阳', geoCoord:[106.71, 26.57]},
{name:'武汉', geoCoord:[114.31, 30.52]},
{name:'郑州', geoCoord:[113.65, 34.76]},
{name:'济南', geoCoord:[117, 36.65]},
{name:'南京', geoCoord:[118.78, 32.04]},
{name:'合肥', geoCoord:[117.27, 31.86]},
{name:'杭州', geoCoord:[120.19, 30.26]},
{name:'南昌', geoCoord:[115.89, 28.68]},
{name:'福州', geoCoord:[119.3, 26.08]},
{name:'广州', geoCoord:[113.23, 23.16]},
{name:'长沙', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'沈阳', geoCoord:[123.38, 41.8]},
{name:'长春', geoCoord:[125.35, 43.88]},
{name:'哈尔滨', geoCoord:[126.63, 45.75]},
{name:'太原', geoCoord:[112.53, 37.87]},
{name:'西安', geoCoord:[108.95, 34.27]},
//{name:'台湾', geoCoord:[121.30, 25.03]},
{name:'北京', geoCoord:[116.46, 39.92]},
{name:'上海', geoCoord:[121.48, 31.22]},
{name:'重庆', geoCoord:[106.54, 29.59]},
{name:'天津', geoCoord:[117.2, 39.13]},
{name:'呼和浩特', geoCoord:[111.65, 40.82]},
{name:'南宁', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'银川', geoCoord:[106.27, 38.47]},
{name:'乌鲁木齐', geoCoord:[87.68, 43.77]},
{name:'香港', geoCoord:[114.17, 22.28]},
{name:'澳门', geoCoord:[113.54, 22.19]}
"""
1
re.findall("[\d\.]+","{name:'澳门', geoCoord:[113.54, 22.19]}")
['113.54', '22.19']

Get data from source using regular expression

1
import re

regular expression

[a-z][A-Z]a: negation colou?r: ? zero or onr of its previous character
colou?r: ? zero or onr of its previous character

  • : zero or more of its previous character +: one or more.:match any single character

  • : ^: start of the line

  • : $:end of the line
  • : | [cat|dog] : cat or dog
  • (da): make the string da like a character
1
l = "color or colour"
1
2
pattern = re.compile("colou?r")
pattern.findall(l)
['color', 'colour']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def get_city_info(city_coordination):
city_location = {}
first = True
for line in city_coordination.split('\n'):
if line.startswith('//'): continue
if line.strip()=="":continue
city =re.findall("name:'(\w+)'",line)[0]
x_y = re.findall("Coord:\[(\d+.\d+),\s(\d+.\d+)\]",line)[0]
if first ==True:
print("x_y: ",x_y)
x_y = tuple(map(float,x_y))
city_location[city] = x_y
if first ==True:
print("city:",city )
print("x_y: ",x_y)
first =False
return city_location
1
city_info = get_city_info(coordination_source)
x_y:  ('103.73', '36.03')
city: 兰州
x_y:  (103.73, 36.03)
1
city_info
{'兰州': (103.73, 36.03),
 '嘉峪关': (98.17, 39.47),
 '西宁': (101.74, 36.56),
 '成都': (104.06, 30.67),
 '石家庄': (114.48, 38.03),
 '拉萨': (102.73, 25.04),
 '贵阳': (106.71, 26.57),
 '武汉': (114.31, 30.52),
 '郑州': (113.65, 34.76),
 '济南': (117.0, 36.65),
 '南京': (118.78, 32.04),
 '合肥': (117.27, 31.86),
 '杭州': (120.19, 30.26),
 '南昌': (115.89, 28.68),
 '福州': (119.3, 26.08),
 '广州': (113.23, 23.16),
 '长沙': (113.0, 28.21),
 '沈阳': (123.38, 41.8),
 '长春': (125.35, 43.88),
 '哈尔滨': (126.63, 45.75),
 '太原': (112.53, 37.87),
 '西安': (108.95, 34.27),
 '北京': (116.46, 39.92),
 '上海': (121.48, 31.22),
 '重庆': (106.54, 29.59),
 '天津': (117.2, 39.13),
 '呼和浩特': (111.65, 40.82),
 '南宁': (108.33, 22.84),
 '银川': (106.27, 38.47),
 '乌鲁木齐': (87.68, 43.77),
 '香港': (114.17, 22.28),
 '澳门': (113.54, 22.19)}

Compute distance between cities

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import math

def geo_distance(origin, destination):
"""
Calculate the Haversine distance.

Parameters
----------
origin: tuple of float
(lat,long)
destination: tuple of float
(lat,long)

Returns
-------
distance_in_km: float

Examples
--------
>>> origin = (48,1372,11.5756) #Munich
>>> destination = (52.5186,13.4083) #Berlin
>>> round(distance(origin,destination),1)
504.2
"""

lat1,lon1 = origin
lat2,lon2 = destination
radius =6371 # km

dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon / 2) * math.sin(dlon / 2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
d = radius * c

return d
1
2
3
4
def get_city_distance(city1,city2):
return geo_distance(city_info[city1],city_info[city2])

get_city_distance("上海","北京")
727.52769688981

Draw the graph

1
2
3
4
5
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
1
2
city_graph =nx.Graph()
city_graph.add_nodes_from(list(city_info.keys()))
1
nx.draw(city_graph,city_info,with_labels=True,node_size=30)

png

Build connection between. Let’s assume that two cities are connected if their distance is less than 700km

1
threshold = 700 # defined the threshold
1
from collections import defaultdict
1
2
3
4
5
6
7
8
9
10
11
12
def build_connection(city_info):
cities_connection = defaultdict(list)
cities = list(city_info.keys())
for c1 in cities:
for c2 in cities:
if c1 == c2 :
continue
if get_city_distance(c1,c2) < threshold:
cities_connection[c1].append(c2)
return cities_connection

cities_connection = build_connection(city_info)
1
cities_connection
defaultdict(list,
            {'兰州': ['嘉峪关', '西宁', '成都', '拉萨', '贵阳', '西安', '重庆', '南宁', '银川'],
             '嘉峪关': ['兰州', '西宁', '成都', '拉萨'],
             '西宁': ['兰州', '嘉峪关', '成都', '拉萨', '贵阳', '重庆', '银川'],
             '成都': ['兰州', '嘉峪关', '西宁', '拉萨', '贵阳', '西安', '重庆', '南宁', '银川'],
             '石家庄': ['武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特'],
             '拉萨': ['兰州', '嘉峪关', '西宁', '成都', '贵阳', '重庆', '南宁', '银川'],
             '贵阳': ['兰州', '西宁', '成都', '拉萨', '西安', '重庆', '南宁', '银川'],
             '武汉': ['石家庄',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '香港',
              '澳门'],
             '郑州': ['石家庄',
              '武汉',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '香港',
              '澳门'],
             '济南': ['石家庄',
              '武汉',
              '郑州',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '呼和浩特'],
             '南京': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '长沙',
              '北京',
              '上海',
              '天津'],
             '合肥': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '杭州',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '香港',
              '澳门'],
             '杭州': ['武汉', '济南', '南京', '合肥', '南昌', '福州', '北京', '上海', '天津'],
             '南昌': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '福州',
              '广州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '香港',
              '澳门'],
             '福州': ['武汉',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '广州',
              '上海',
              '香港',
              '澳门'],
             '广州': ['石家庄',
              '武汉',
              '郑州',
              '合肥',
              '南昌',
              '福州',
              '长沙',
              '太原',
              '西安',
              '南宁',
              '香港',
              '澳门'],
             '长沙': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '南宁',
              '香港',
              '澳门'],
             '沈阳': ['长春', '哈尔滨', '上海'],
             '长春': ['沈阳', '哈尔滨'],
             '哈尔滨': ['沈阳', '长春'],
             '太原': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '银川',
              '澳门'],
             '西安': ['兰州',
              '成都',
              '石家庄',
              '贵阳',
              '武汉',
              '郑州',
              '广州',
              '长沙',
              '太原',
              '重庆',
              '呼和浩特',
              '南宁',
              '银川'],
             '北京': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '长沙',
              '太原',
              '天津',
              '呼和浩特'],
             '上海': ['济南', '南京', '合肥', '杭州', '南昌', '福州', '沈阳', '天津'],
             '重庆': ['兰州', '西宁', '成都', '拉萨', '贵阳', '西安', '呼和浩特', '南宁', '银川'],
             '天津': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '长沙',
              '太原',
              '北京',
              '上海',
              '呼和浩特'],
             '呼和浩特': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '长沙',
              '太原',
              '西安',
              '北京',
              '重庆',
              '天津',
              '银川'],
             '南宁': ['兰州',
              '成都',
              '拉萨',
              '贵阳',
              '广州',
              '长沙',
              '西安',
              '重庆',
              '银川',
              '香港',
              '澳门'],
             '银川': ['兰州',
              '西宁',
              '成都',
              '拉萨',
              '贵阳',
              '太原',
              '西安',
              '重庆',
              '呼和浩特',
              '南宁'],
             '香港': ['武汉', '郑州', '合肥', '南昌', '福州', '广州', '长沙', '南宁', '澳门'],
             '澳门': ['武汉',
              '郑州',
              '合肥',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '南宁',
              '香港']})

Draw connection graph

1
cities_connection_graph = nx.Graph(cities_connection)
1
nx.draw(cities_connection_graph,city_info,with_labels = True, node_size=10)

png

BFS 1 version

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def search_1(graph, start, destination):
pathes = [[start]] # list 用来存储待搜索路径
visited = set() #set 用来存车已搜索节点


while pathes:
path= pathes.pop(0) #提取第一条路径
froniter = path[-1] #提取即将要探索的节点

if froniter in visited: continue #检查如果该点已经探索过,则不用探索

successors = graph[froniter]
for city in successors: # 遍历子节点
if city in path: continue # 检查会不会形成环

new_path = path +[city]
pathes.append(new_path) #bfs #将新路径加到list里面
# pathes = [new_path] + pathes #dfs
if city ==destination: #检查是否到达目的地
return new_path
visited.add(froniter)
1
search_1(cities_connection,"上海","石家庄")
['上海', '济南', '石家庄']

Optimal search using variation of BFS

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def search_2(graph,start,destination,search_strategy):
pathes = [[start]]
visited =set() # !
while pathes:
path = pathes.pop(0)
froniter =path[-1]
if froniter in visited:continue # !

if froniter == destination:
return path
successors = graph[froniter]

for city in successors:
if city in path:continue #check loop

new_path = path + [city]
pathes.append(new_path)

pathes =search_strategy(pathes)
visited.add(froniter) # !
#
1
2
3
4
5
6
7
def sort_by_distance(pathes):
def get_distance_of_path(path):
distance = 0
for i,_ in enumerate(path[:-1]):
distance += get_city_distance(path[i],path[i+1])
return distance
return sorted(pathes,key =get_distance_of_path)
1
2
3
4
5
def get_distance_of_path(path):
distance = 0
for i,_ in enumerate(path[:-1]):
distance += get_city_distance(path[i],path[i+1])
return distance
1
get_distance_of_path(["北京","济南","上海"])
752.66259009181
1
search_2(cities_connection,"北京","上海",search_strategy=lambda x:x)
['北京', '济南', '上海']
1
search_2(cities_connection,"北京","上海",search_strategy=sort_by_distance)
['北京', '天津', '上海']

Machine Learning

1
from sklearn.datasets import load_boston
1
dataset = load_boston()
1
#dataset
1
x,y=dataset['data'],dataset['target']
1
x.shape
(506, 13)
1
y.shape
(506,)
1
x[1].shape
(13,)
1
x[1]
array([2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
       6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
       1.7800e+01, 3.9690e+02, 9.1400e+00])
1
dataset.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
1
dataset['DESCR']
".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n"
1
X_rm = x[:,5]
1
2
# plot the RM with respect to y
plt.scatter(X_rm,y)
<matplotlib.collections.PathCollection at 0x11955a9f148>

png

Gradient descent

Assume that the target function is a linear function

1
2
3
# define target function
def price(rm,k,b):
return k*rm + b

Define mean square loss

1
2
3
# define loss function
def loss(y,y_hat):
return sum((y_i - y_hat_i)**2 for y_i,y_hat_i in zip(list(y),list(y_hat)))/len(list(y))

Define partial derivatives

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#define partial derivative
def partial_derivative_k(x,y,y_hat):
n = len(y)
gradient = 0
for x_i,y_i,y_hat_i in zip(list(x),list(y),list(y_hat)):
gradient += (y_i - y_hat_i) * x_i
return -2/n * gradient

def partial_derivative_b(y,y_hat):
n =len(y)
gradient = 0
for y_i,y_hat_i in zip(list(y),list(y_hat)):
gradient+= (y_i-y_hat_i)
return -2/n* gradient
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import random
#initialized parameters
k= random.random()*200-100 # -100 100
b= random.random()*200-100 # -100 100

learning_rate = 1e-3

iteration_num =200
losses = []
for i in range(iteration_num):
price_use_current_parameters = [price(r,k,b) for r in X_rm] # \hat{y}

current_loss = loss(y,price_use_current_parameters)
losses.append(current_loss)
#print("Iteration {},the loss is {},parameters k is {} and b is {}".format(i,current_loss,k,b))

k_gradient = partial_derivative_k(X_rm,y,price_use_current_parameters)
b_gradient = partial_derivative_b(y,price_use_current_parameters)

k = k + (-1 * k_gradient) * learning_rate
b = b + (-1 * b_gradient) * learning_rate

best_k =k
best_b =b
print("best_k is{},best_b is {}".format(best_k,best_b))
best_k is12.41253129958806,best_b is -55.72859329657179
1
plt.plot(list(range(iteration_num)),losses)
[<matplotlib.lines.Line2D at 0x119550b6fc8>]

png

1
2
3
4
price_use_best_paramters = [price(r,best_k,best_b) for r in X_rm]

plt.scatter(X_rm,y)
plt.scatter(X_rm,price_use_current_parameters)
<matplotlib.collections.PathCollection at 0x11955a2d188>

png

1
2


O(∩_∩)O哈哈~