changeset 1083:5b597b021aed

added function to aggregate interactions
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 23 Jul 2018 20:17:27 -0400
parents 706034a4c6cd
children 1a7e0b2c858b
files scripts/process.py
diffstat 1 files changed, 52 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/process.py	Sun Jul 22 08:50:28 2018 -0400
+++ b/scripts/process.py	Mon Jul 23 20:17:27 2018 -0400
@@ -24,7 +24,7 @@
 parser.add_argument('--process', dest = 'process', help = 'data to process', choices = ['feature', 'object', 'classification', 'prototype', 'interaction'])
 parser.add_argument('--display', dest = 'display', help = 'data to display (replay over video)', choices = ['feature', 'object', 'classification', 'interaction'])
 parser.add_argument('--progress', dest = 'progress', help = 'information about the progress of processing', action = 'store_true')
-parser.add_argument('--analyze', dest = 'analyze', help = 'data to analyze (results)', choices = ['feature', 'object', 'classification', 'interaction', 'event'])
+parser.add_argument('--analyze', dest = 'analyze', help = 'data to analyze (results)', choices = ['feature', 'object', 'classification', 'interaction', 'event-speed', 'event-interaction'])
 
 # common options
 parser.add_argument('--cfg', dest = 'configFilename', help = 'name of the configuration file')
@@ -335,8 +335,8 @@
     elif args.output == 'event':
         data.to_csv(args.eventFilename, index = False)
 
-if args.analyze == 'event': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds
-    data = pd.read_csv(args.eventFilename, parse_dates = [2])
+if args.analyze == 'event-speed': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds
+    data = pd.read_csv(args.eventFilename, parse_dates = [2], nrows = 10000)
     #data = pd.read_csv('./speeds.csv', converters = {'time': lambda s: datetime.datetime.strptime(s, "%H:%M:%S").time()}, nrows = 5000)
     # create time for end of each 15 min, then group by, using the agg method for each data column
     headers = ['site', 'date', 'intervalend15', 'duration', 'count']
@@ -345,9 +345,10 @@
     for h in dataColumns:
         for h2 in tmpheaders:
             headers.append(h+'-'+h2)
-    for h in dataColumns:
-        for t in args.eventThresholds:
-            headers.append('n-{}-{}'.format(h, t))
+    if args.eventThresholds is not None:
+        for h in dataColumns:
+            for t in args.eventThresholds:
+                headers.append('n-{}-{}'.format(h, t))
     data['intervalend15'] = data.time.apply(lambda t: (pd.Timestamp(year = t.year, month = t.month, day = t.day,hour = t.hour, minute = (t.minute // args.intervalDuration)*args.intervalDuration)+pd.Timedelta(minutes = 15)).time())
     outputData = []
     for name, group in data.groupby(['site', 'date', 'intervalend15']):
@@ -364,8 +365,50 @@
                     row.extend(aggregated)
                 else:
                     row.append(aggregated)
-        for h in dataColumns:
-            for t in args.eventThresholds:
-                row.append((group[h] > t).sum())
+        if args.eventThresholds is not None:
+            for h in dataColumns:
+                for t in args.eventThresholds:
+                    row.append((group[h] > t).sum())
         outputData.append(row)
     pd.DataFrame(outputData, columns = headers).to_csv(utils.removeExtension(args.eventFilename)+'-aggregated.csv', index = False)
+
+elif args.analyze == 'event-interaction': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds
+    data = pd.read_csv(args.eventFilename, parse_dates = [2], nrows = 20000)
+    headers = ['site', 'date', 'intervalend15', 'duration', 'count']
+    aggFunctions, tmpheaders = utils.aggregationMethods(args.aggMethods, args.aggCentiles)
+    dataColumns = list(data.columns[3:])
+    for h in dataColumns:
+        if not 'speed' in h.lower(): # proximity indicators are reversed: the 85th centile of the negated column yields the 15th centile (whose opposite must be taken again afterwards)
+            data[h] = -data[h]
+    for h in dataColumns:
+        for h2 in tmpheaders:
+            headers.append(h+'-'+h2)
+    for h,t in zip(dataColumns, args.eventThresholds): # each threshold in this case applies to one indicator
+        headers.append('n-{}-{}'.format(h, t))
+    data['intervalend15'] = data.time.apply(lambda t: (pd.Timestamp(year = t.year, month = t.month, day = t.day,hour = t.hour, minute = (t.minute // args.intervalDuration)*args.intervalDuration)+pd.Timedelta(minutes = 15)).time())
+    outputData = []
+    for name, group in data.groupby(['site', 'date', 'intervalend15']):
+        row = []
+        row.extend(name)
+        groupStartTime = group.time.min()
+        groupEndTime = group.time.max()
+        row.append((groupEndTime.minute+1-groupStartTime.minute) % 60)#(name[2].minute*60+name[2].second-groupStartTime.minute*60+groupStartTime.second) % 3600)
+        row.append(len(group))
+        for h in dataColumns:
+            for method,func in aggFunctions.items():
+                tmp = group.loc[~group[h].isna(), h]
+                if len(tmp)>0:
+                    aggregated = func(tmp) # todo invert if the resulting stat is negative
+                    if method == 'centile':
+                        row.extend(np.abs(aggregated))
+                    else:
+                        row.append(np.abs(aggregated))
+                else:
+                    row.extend([None]*len(aggFunctions))
+        for h,t in zip(dataColumns, args.eventThresholds): # each threshold in this case applies to one indicator
+            if 'speed' in h.lower():
+                row.append((group[h] > t).sum())
+            else:
+                row.append((group[h] > -t).sum()) # take values larger than the negative threshold for proximity indicators
+        outputData.append(row)
+    pd.DataFrame(outputData, columns = headers).to_csv(utils.removeExtension(args.eventFilename)+'-aggregated.csv', index = False)