In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
# Relative paths to the two CSV inputs read by the loading cell below.
data_filename = 'data/nyc_data.csv'   # trip records
fare_filename = 'data/nyc_fare.csv'   # fare records
In [6]:
 
# Read both CSV files, asking pandas to parse the timestamp columns into
# datetimes instead of leaving them as plain strings.
data = pd.read_csv(
    data_filename,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
fare = pd.read_csv(
    fare_filename,
    parse_dates=['pickup_datetime'],
)

# Show the first three trip rows as the cell output.
data.head(3)
Out[6]:
medallion hack_license vendor_id rate_code store_and_fwd_flag pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
0 76942C3205E17D7E7FE5A9F709D16434 25BA06A87905667AA1FE5990E33F0E2E VTS 1 NaN 2013-01-01 00:00:00 2013-01-01 00:05:00 3 300 0.61 -73.955925 40.781887 -73.963181 40.777832
1 517C6B330DBB3F055D007B07512628B3 2C19FBEE1A6E05612EFE4C958C14BC7F VTS 1 NaN 2013-01-01 00:05:00 2013-01-01 00:21:00 1 960 3.28 -74.005501 40.745735 -73.964943 40.755722
2 ED15611F168E41B33619C83D900FE266 754AEBD7C80DA17BA1D81D89FB6F4D1D CMT 1 N 2013-01-01 00:05:52 2013-01-01 00:12:18 1 386 1.50 -73.969955 40.799770 -73.954567 40.787392
In [8]:
# Summary statistics (count/mean/std/min/quartiles/max) of the numeric columns.
# NOTE(review): the recorded output shows a negative minimum trip_time_in_secs
# and a very large maximum trip_distance, so the raw data presumably contains
# some bad records -- confirm before relying on means/stds.
data.describe()
Out[8]:
rate_code passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
count 846945.000000 846945.000000 846945.000000 846945.000000 846945.000000 846945.000000 846945.000000 846945.000000
mean 1.026123 1.710272 812.523879 9.958211 -73.975155 40.750490 -73.974197 40.750967
std 0.223480 1.375266 16098.305145 6525.204888 0.035142 0.027224 0.033453 0.030766
min 0.000000 0.000000 -10.000000 0.000000 -74.098305 40.009911 -74.099998 40.009911
25% 1.000000 1.000000 361.000000 1.050000 -73.992371 40.736031 -73.991570 40.735207
50% 1.000000 1.000000 600.000000 1.800000 -73.982094 40.752975 -73.980614 40.753597
75% 1.000000 2.000000 960.000000 3.200000 -73.968048 40.767460 -73.965157 40.768227
max 6.000000 6.000000 4294796.000000 6005123.000000 -73.028473 40.996132 -73.027061 40.998592
In [10]:
# List the column labels of the trip DataFrame.
data.columns
Out[10]:
Index(['medallion', 'hack_license', 'vendor_id', 'rate_code',
       'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'trip_time_in_secs', 'trip_distance',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude'],
      dtype='object')
In [14]:
# Pull the pickup (p_*) and dropoff (d_*) coordinate columns out of the trip
# DataFrame as individual Series.
p_lng, p_lat = data['pickup_longitude'], data['pickup_latitude']
d_lng, d_lat = data['dropoff_longitude'], data['dropoff_latitude']
In [17]:
# Display the pickup longitude Series (pandas elides the middle of long Series).
p_lng
Out[17]:
0        -73.955925
1        -74.005501
2        -73.969955
3        -73.991432
4        -73.966225
5        -73.955238
6        -73.985580
7        -73.999413
8        -73.992180
9        -74.006554
10       -73.972473
11       -74.007263
12       -73.984589
13       -73.987991
14       -74.006111
15       -73.988693
16       -73.982094
17       -73.976822
18       -73.978676
19       -73.986336
20       -73.969917
21       -73.954521
22       -74.002953
23       -74.006683
24       -73.974174
25       -73.979942
26       -73.970970
27       -73.988281
28       -73.997452
29       -73.981506
            ...    
846915   -74.010689
846916   -73.982697
846917   -73.977768
846918   -74.016182
846919   -73.951698
846920   -73.951981
846921   -73.987846
846922   -73.984886
846923   -73.953316
846924   -73.987709
846925   -73.969978
846926   -73.985458
846927   -73.982651
846928   -73.996460
846929   -73.972366
846930   -73.980141
846931   -73.987236
846932   -74.005722
846933   -73.970688
846934   -73.993156
846935   -73.952888
846936   -73.927185
846937   -73.974609
846938   -73.977196
846939   -73.973480
846940   -73.992058
846941   -73.994949
846942   -73.993492
846943   -73.978477
846944   -73.987206
Name: pickup_longitude, dtype: float64
In [19]:
def lat_lng_to_pixels(lat, lng):
    """Project latitude/longitude given in degrees onto scaled Mercator coordinates.

    The horizontal coordinate is linear in longitude:
    ``x = 100 * (lng + 180) / 360``, so -180 maps to 0 and +180 maps to 100.

    The vertical coordinate uses the Mercator expression
    ``m = log(tan((lat_in_radians + pi/2) / 2))`` and rescales it as
    ``y = 100 * (m - pi) / (2 * pi)``.

    Only arithmetic operators and NumPy ufuncs are used, so the inputs may be
    scalars, NumPy arrays or pandas Series; the outputs have the matching form.

    Parameters
    ----------
    lat : latitude(s) in degrees.
    lng : longitude(s) in degrees.

    Returns
    -------
    tuple
        ``(x, y)`` -- note the order: x is derived from ``lng``, y from ``lat``.
    """
    # Horizontal axis: plain linear rescaling of the longitude.
    x = 100 * (lng + 180.0) / 360.0

    # Vertical axis: degrees -> radians, then the Mercator stretch.
    phi = lat * np.pi / 180.0
    mercator = np.log(np.tan((phi + np.pi / 2.0) / 2.0))
    y = 100 * (mercator - np.pi) / (2.0 * np.pi)

    return x, y
In [21]:
 
# Project the pickup locations; keyword arguments make the (lat, lng) order explicit.
px, py = lat_lng_to_pixels(lat=p_lat, lng=p_lng)
In [23]:
# Preview the first 13 projected x coordinates of the pickup locations.
px.head(13)
Out[23]:
0     29.456688
1     29.442916
2     29.452790
3     29.446824
4     29.453826
5     29.456878
6     29.448450
7     29.444608
8     29.446617
9     29.442624
10    29.452091
11    29.442427
12    29.448725
Name: pickup_longitude, dtype: float64
In [25]:
 
# Scatter plot of every projected pickup location.
# The dataset has roughly 847k rows (see the describe() counts), so default
# marker size and full opacity would merge into a solid blob. Tiny, highly
# transparent markers let dense areas show up as darker regions instead.
plt.figure(figsize=(8, 6))
plt.title('Pickup locations')
plt.xlabel('x (projected longitude)')
plt.ylabel('y (projected latitude)')
# Kept as the last expression so the cell still returns the PathCollection.
plt.scatter(px, py, s=0.1, alpha=0.03)
Out[25]:
<matplotlib.collections.PathCollection at 0x17707a58>
In [33]:
 
import networkx as nx

# Build an undirected graph from the edge-list file (one edge per line).
graph = nx.read_edgelist('3980.edges')

# Report the graph size. Bare `len(...)` expressions in the middle of a cell
# are evaluated and then discarded, so the counts are printed explicitly.
print('{} nodes, {} edges'.format(graph.number_of_nodes(),
                                  graph.number_of_edges()))

# Draw the graph with networkx's default layout and render the figure.
nx.draw(graph)
plt.show()