AI for NLP

Build Graph

coordination_source = """
{name:'兰州', geoCoord:[103.73, 36.03]},
{name:'嘉峪关', geoCoord:[98.17, 39.47]},
{name:'西宁', geoCoord:[101.74, 36.56]},
{name:'成都', geoCoord:[104.06, 30.67]},
{name:'石家庄', geoCoord:[114.48, 38.03]},
{name:'拉萨', geoCoord:[102.73, 25.04]},
{name:'贵阳', geoCoord:[106.71, 26.57]},
{name:'武汉', geoCoord:[114.31, 30.52]},
{name:'郑州', geoCoord:[113.65, 34.76]},
{name:'济南', geoCoord:[117, 36.65]},
{name:'南京', geoCoord:[118.78, 32.04]},
{name:'合肥', geoCoord:[117.27, 31.86]},
{name:'杭州', geoCoord:[120.19, 30.26]},
{name:'南昌', geoCoord:[115.89, 28.68]},
{name:'福州', geoCoord:[119.3, 26.08]},
{name:'广州', geoCoord:[113.23, 23.16]},
{name:'长沙', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'沈阳', geoCoord:[123.38, 41.8]},
{name:'长春', geoCoord:[125.35, 43.88]},
{name:'哈尔滨', geoCoord:[126.63, 45.75]},
{name:'太原', geoCoord:[112.53, 37.87]},
{name:'西安', geoCoord:[108.95, 34.27]},
//{name:'台湾', geoCoord:[121.30, 25.03]},
{name:'北京', geoCoord:[116.46, 39.92]},
{name:'上海', geoCoord:[121.48, 31.22]},
{name:'重庆', geoCoord:[106.54, 29.59]},
{name:'天津', geoCoord:[117.2, 39.13]},
{name:'呼和浩特', geoCoord:[111.65, 40.82]},
{name:'南宁', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'银川', geoCoord:[106.27, 38.47]},
{name:'乌鲁木齐', geoCoord:[87.68, 43.77]},
{name:'香港', geoCoord:[114.17, 22.28]},
{name:'澳门', geoCoord:[113.54, 22.19]}
"""

1	re.findall("[\d\.]+","{name:'澳门', geoCoord:[113.54, 22.19]}")

['113.54', '22.19']

Get data from source using regular expression

import re

regular expression

[a-z][A-Z]^a: negation colou?r: ? zero or onr of its previous character
colou?r: ? zero or onr of its previous character

: zero or more of its previous character +: one or more.:match any single character
: ^: start of the line
: $:end of the line
: | [cat|dog] : cat or dog
(da): make the string da like a character

1	l = "color or colour"

1 2	pattern = re.compile("colou?r") pattern.findall(l)

['color', 'colour']

def get_city_info(city_coordination):
    city_location = {}
    first = True
    for line in city_coordination.split('\n'):
        if line.startswith('//'): continue
        if line.strip()=="":continue    
        city =re.findall("name:'(\w+)'",line)[0]
        x_y = re.findall("Coord:\[(\d+.\d+),\s(\d+.\d+)\]",line)[0]
        if first ==True:
            print("x_y: ",x_y)
        x_y = tuple(map(float,x_y))
        city_location[city] = x_y
        if first ==True:
            print("city:",city )
            print("x_y: ",x_y)
            first =False
    return city_location

1	city_info = get_city_info(coordination_source)

x_y:  ('103.73', '36.03')
city: 兰州
x_y:  (103.73, 36.03)

city_info

{'兰州': (103.73, 36.03),
 '嘉峪关': (98.17, 39.47),
 '西宁': (101.74, 36.56),
 '成都': (104.06, 30.67),
 '石家庄': (114.48, 38.03),
 '拉萨': (102.73, 25.04),
 '贵阳': (106.71, 26.57),
 '武汉': (114.31, 30.52),
 '郑州': (113.65, 34.76),
 '济南': (117.0, 36.65),
 '南京': (118.78, 32.04),
 '合肥': (117.27, 31.86),
 '杭州': (120.19, 30.26),
 '南昌': (115.89, 28.68),
 '福州': (119.3, 26.08),
 '广州': (113.23, 23.16),
 '长沙': (113.0, 28.21),
 '沈阳': (123.38, 41.8),
 '长春': (125.35, 43.88),
 '哈尔滨': (126.63, 45.75),
 '太原': (112.53, 37.87),
 '西安': (108.95, 34.27),
 '北京': (116.46, 39.92),
 '上海': (121.48, 31.22),
 '重庆': (106.54, 29.59),
 '天津': (117.2, 39.13),
 '呼和浩特': (111.65, 40.82),
 '南宁': (108.33, 22.84),
 '银川': (106.27, 38.47),
 '乌鲁木齐': (87.68, 43.77),
 '香港': (114.17, 22.28),
 '澳门': (113.54, 22.19)}

Compute distance between cities

import math

def geo_distance(origin, destination):
    """
    Calculate the Haversine distance.
    
    Parameters
    ----------
    origin: tuple of float
        (lat,long)
    destination: tuple of float
        (lat,long)
    
    Returns
    -------
    distance_in_km: float
    
    Examples
    --------
    >>> origin = (48,1372,11.5756) #Munich
    >>> destination = (52.5186,13.4083) #Berlin
    >>> round(distance(origin,destination),1)
    504.2
    """
    
    lat1,lon1 = origin
    lat2,lon2 = destination
    radius =6371 # km
    
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d

def get_city_distance(city1,city2):
    return geo_distance(city_info[city1],city_info[city2])

get_city_distance("上海","北京")

727.52769688981

Draw the graph

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

1 2	city_graph =nx.Graph() city_graph.add_nodes_from(list(city_info.keys()))

1	nx.draw(city_graph,city_info,with_labels=True,node_size=30)

Build connection between. Let’s assume that two cities are connected if their distance is less than 700km

1	threshold = 700 # defined the threshold

1	from collections import defaultdict

def build_connection(city_info):
    cities_connection = defaultdict(list)
    cities = list(city_info.keys())
    for c1 in cities:
        for c2 in cities:
            if c1 == c2 :
                continue
            if get_city_distance(c1,c2) < threshold:
                cities_connection[c1].append(c2)
    return cities_connection

cities_connection = build_connection(city_info)

1	cities_connection

defaultdict(list,
            {'兰州': ['嘉峪关', '西宁', '成都', '拉萨', '贵阳', '西安', '重庆', '南宁', '银川'],
             '嘉峪关': ['兰州', '西宁', '成都', '拉萨'],
             '西宁': ['兰州', '嘉峪关', '成都', '拉萨', '贵阳', '重庆', '银川'],
             '成都': ['兰州', '嘉峪关', '西宁', '拉萨', '贵阳', '西安', '重庆', '南宁', '银川'],
             '石家庄': ['武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特'],
             '拉萨': ['兰州', '嘉峪关', '西宁', '成都', '贵阳', '重庆', '南宁', '银川'],
             '贵阳': ['兰州', '西宁', '成都', '拉萨', '西安', '重庆', '南宁', '银川'],
             '武汉': ['石家庄',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '香港',
              '澳门'],
             '郑州': ['石家庄',
              '武汉',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '香港',
              '澳门'],
             '济南': ['石家庄',
              '武汉',
              '郑州',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '呼和浩特'],
             '南京': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '合肥',
              '杭州',
              '南昌',
              '福州',
              '长沙',
              '北京',
              '上海',
              '天津'],
             '合肥': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '杭州',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '香港',
              '澳门'],
             '杭州': ['武汉', '济南', '南京', '合肥', '南昌', '福州', '北京', '上海', '天津'],
             '南昌': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '福州',
              '广州',
              '长沙',
              '太原',
              '北京',
              '上海',
              '天津',
              '香港',
              '澳门'],
             '福州': ['武汉',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '广州',
              '上海',
              '香港',
              '澳门'],
             '广州': ['石家庄',
              '武汉',
              '郑州',
              '合肥',
              '南昌',
              '福州',
              '长沙',
              '太原',
              '西安',
              '南宁',
              '香港',
              '澳门'],
             '长沙': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '南昌',
              '广州',
              '太原',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '南宁',
              '香港',
              '澳门'],
             '沈阳': ['长春', '哈尔滨', '上海'],
             '长春': ['沈阳', '哈尔滨'],
             '哈尔滨': ['沈阳', '长春'],
             '太原': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '合肥',
              '南昌',
              '广州',
              '长沙',
              '西安',
              '北京',
              '天津',
              '呼和浩特',
              '银川',
              '澳门'],
             '西安': ['兰州',
              '成都',
              '石家庄',
              '贵阳',
              '武汉',
              '郑州',
              '广州',
              '长沙',
              '太原',
              '重庆',
              '呼和浩特',
              '南宁',
              '银川'],
             '北京': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '长沙',
              '太原',
              '天津',
              '呼和浩特'],
             '上海': ['济南', '南京', '合肥', '杭州', '南昌', '福州', '沈阳', '天津'],
             '重庆': ['兰州', '西宁', '成都', '拉萨', '贵阳', '西安', '呼和浩特', '南宁', '银川'],
             '天津': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '南京',
              '合肥',
              '杭州',
              '南昌',
              '长沙',
              '太原',
              '北京',
              '上海',
              '呼和浩特'],
             '呼和浩特': ['石家庄',
              '武汉',
              '郑州',
              '济南',
              '长沙',
              '太原',
              '西安',
              '北京',
              '重庆',
              '天津',
              '银川'],
             '南宁': ['兰州',
              '成都',
              '拉萨',
              '贵阳',
              '广州',
              '长沙',
              '西安',
              '重庆',
              '银川',
              '香港',
              '澳门'],
             '银川': ['兰州',
              '西宁',
              '成都',
              '拉萨',
              '贵阳',
              '太原',
              '西安',
              '重庆',
              '呼和浩特',
              '南宁'],
             '香港': ['武汉', '郑州', '合肥', '南昌', '福州', '广州', '长沙', '南宁', '澳门'],
             '澳门': ['武汉',
              '郑州',
              '合肥',
              '南昌',
              '福州',
              '广州',
              '长沙',
              '太原',
              '南宁',
              '香港']})

Draw connection graph

1	cities_connection_graph = nx.Graph(cities_connection)

1	nx.draw(cities_connection_graph,city_info,with_labels = True, node_size=10)

BFS 1 version

def search_1(graph, start, destination):
    pathes = [[start]] # list 用来存储待搜索路径
    visited = set() #set 用来存车已搜索节点
    
    
    while pathes:
        path= pathes.pop(0) #提取第一条路径
        froniter = path[-1] #提取即将要探索的节点
        
        if froniter in visited: continue #检查如果该点已经探索过，则不用探索
        
        successors = graph[froniter]
        for city in successors: # 遍历子节点
            if city in path: continue # 检查会不会形成环
            
            new_path = path +[city]
            pathes.append(new_path) #bfs #将新路径加到list里面
            # pathes = [new_path] + pathes #dfs
            if city ==destination: #检查是否到达目的地
                return new_path
        visited.add(froniter)

1	search_1(cities_connection,"上海","石家庄")

['上海', '济南', '石家庄']

Optimal search using variation of BFS

def search_2(graph,start,destination,search_strategy):
    pathes = [[start]]
    visited =set() # !
    while pathes:
        path = pathes.pop(0)
        froniter =path[-1]
        if froniter in visited:continue # !
        
        if froniter == destination:
            return path
        successors = graph[froniter]
        
        for city in successors:
            if city in path:continue #check loop
            
            new_path = path + [city]
            pathes.append(new_path)
        
        pathes =search_strategy(pathes)
        visited.add(froniter) # !
        #

def sort_by_distance(pathes):
    def get_distance_of_path(path):
        distance = 0
        for i,_ in enumerate(path[:-1]):
            distance += get_city_distance(path[i],path[i+1])
        return distance
    return sorted(pathes,key =get_distance_of_path)

def get_distance_of_path(path):
        distance = 0
        for i,_ in enumerate(path[:-1]):
            distance += get_city_distance(path[i],path[i+1])
        return distance

1	get_distance_of_path(["北京","济南","上海"])

752.66259009181

1	search_2(cities_connection,"北京","上海",search_strategy=lambda x:x)

['北京', '济南', '上海']

1	search_2(cities_connection,"北京","上海",search_strategy=sort_by_distance)

['北京', '天津', '上海']

Machine Learning

1	from sklearn.datasets import load_boston

1	dataset = load_boston()

#dataset

1	x,y=dataset['data'],dataset['target']

x.shape

(506, 13)

y.shape

(506,)

1	x[1].shape

(13,)

x[1]

array([2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
       6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
       1.7800e+01, 3.9690e+02, 9.1400e+00])

1	dataset.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

1	dataset['DESCR']

".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n"

1	X_rm = x[:,5]

1 2	# plot the RM with respect to y plt.scatter(X_rm,y)

<matplotlib.collections.PathCollection at 0x11955a9f148>

Gradient descent

Assume that the target function is a linear function

$y = k*rm +b$

1
2
3

# define target function
def price(rm,k,b):
    return k*rm + b

Define mean square loss

$loss = \frac{1}{n} \sum{(y_i - \hat{y_i})}^2$ $loss = \frac{1}{n}\sum{(y_i) - (kx_i + b_i))}^2$

1
2
3

# define loss function
def loss(y,y_hat):
    return sum((y_i - y_hat_i)**2 for y_i,y_hat_i in zip(list(y),list(y_hat)))/len(list(y))

Define partial derivatives

$\frac {\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum(y_i -\hat{y_i})x_i$ $\frac{\partial{loss}}{\partial{b}} = -\frac{2}{n}\sum(y_i - \hat{y_i})$

#define partial derivative
def partial_derivative_k(x,y,y_hat):
    n = len(y)
    gradient = 0
    for x_i,y_i,y_hat_i in zip(list(x),list(y),list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2/n * gradient

def partial_derivative_b(y,y_hat):
    n =len(y)
    gradient = 0
    for y_i,y_hat_i in zip(list(y),list(y_hat)):
        gradient+= (y_i-y_hat_i)
    return -2/n* gradient

import random
#initialized parameters
k= random.random()*200-100 # -100 100
b= random.random()*200-100 # -100 100

learning_rate = 1e-3

iteration_num =200
losses = []
for i in range(iteration_num):
    price_use_current_parameters = [price(r,k,b) for r in X_rm] # \hat{y}
    
    current_loss = loss(y,price_use_current_parameters)
    losses.append(current_loss)
    #print("Iteration {},the loss is {},parameters k is {} and b is {}".format(i,current_loss,k,b))
    
    k_gradient = partial_derivative_k(X_rm,y,price_use_current_parameters)
    b_gradient = partial_derivative_b(y,price_use_current_parameters)
    
    k = k + (-1 * k_gradient) * learning_rate
    b = b + (-1 * b_gradient) * learning_rate

best_k =k 
best_b =b
print("best_k is{},best_b is {}".format(best_k,best_b))

best_k is12.41253129958806,best_b is -55.72859329657179

1	plt.plot(list(range(iteration_num)),losses)

[<matplotlib.lines.Line2D at 0x119550b6fc8>]

price_use_best_paramters = [price(r,best_k,best_b) for r in X_rm]

plt.scatter(X_rm,y)
plt.scatter(X_rm,price_use_current_parameters)

<matplotlib.collections.PathCollection at 0x11955a2d188>

1
2