Mercurial Hosting > traffic-intelligence
diff scripts/process.py @ 1083:5b597b021aed
added function to aggregate interactions
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 23 Jul 2018 20:17:27 -0400 |
parents | 706034a4c6cd |
children | 1a7e0b2c858b |
line wrap: on
line diff
--- a/scripts/process.py Sun Jul 22 08:50:28 2018 -0400 +++ b/scripts/process.py Mon Jul 23 20:17:27 2018 -0400 @@ -24,7 +24,7 @@ parser.add_argument('--process', dest = 'process', help = 'data to process', choices = ['feature', 'object', 'classification', 'prototype', 'interaction']) parser.add_argument('--display', dest = 'display', help = 'data to display (replay over video)', choices = ['feature', 'object', 'classification', 'interaction']) parser.add_argument('--progress', dest = 'progress', help = 'information about the progress of processing', action = 'store_true') -parser.add_argument('--analyze', dest = 'analyze', help = 'data to analyze (results)', choices = ['feature', 'object', 'classification', 'interaction', 'event']) +parser.add_argument('--analyze', dest = 'analyze', help = 'data to analyze (results)', choices = ['feature', 'object', 'classification', 'interaction', 'event-speed', 'event-interaction']) # common options parser.add_argument('--cfg', dest = 'configFilename', help = 'name of the configuration file') @@ -335,8 +335,8 @@ elif args.output == 'event': data.to_csv(args.eventFilename, index = False) -if args.analyze == 'event': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds - data = pd.read_csv(args.eventFilename, parse_dates = [2]) +if args.analyze == 'event-speed': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds + data = pd.read_csv(args.eventFilename, parse_dates = [2], nrows = 10000) #data = pd.read_csv('./speeds.csv', converters = {'time': lambda s: datetime.datetime.strptime(s, "%H:%M:%S").time()}, nrows = 5000) # create time for end of each 15 min, then group by, using the agg method for each data column headers = ['site', 'date', 'intervalend15', 'duration', 'count'] @@ -345,9 +345,10 @@ for h in dataColumns: for h2 in tmpheaders: headers.append(h+'-'+h2) - for h in dataColumns: - for t in args.eventThresholds: - 
headers.append('n-{}-{}'.format(h, t)) + if args.eventThresholds is not None: + for h in dataColumns: + for t in args.eventThresholds: + headers.append('n-{}-{}'.format(h, t)) data['intervalend15'] = data.time.apply(lambda t: (pd.Timestamp(year = t.year, month = t.month, day = t.day,hour = t.hour, minute = (t.minute // args.intervalDuration)*args.intervalDuration)+pd.Timedelta(minutes = 15)).time()) outputData = [] for name, group in data.groupby(['site', 'date', 'intervalend15']): @@ -364,8 +365,50 @@ row.extend(aggregated) else: row.append(aggregated) - for h in dataColumns: - for t in args.eventThresholds: - row.append((group[h] > t).sum()) + if args.eventThresholds is not None: + for h in dataColumns: + for t in args.eventThresholds: + row.append((group[h] > t).sum()) outputData.append(row) pd.DataFrame(outputData, columns = headers).to_csv(utils.removeExtension(args.eventFilename)+'-aggregated.csv', index = False) + +elif args.analyze == 'event-interaction': # aggregate event data by 15 min interval (args.intervalDuration), count events with thresholds + data = pd.read_csv(args.eventFilename, parse_dates = [2], nrows = 20000) + headers = ['site', 'date', 'intervalend15', 'duration', 'count'] + aggFunctions, tmpheaders = utils.aggregationMethods(args.aggMethods, args.aggCentiles) + dataColumns = list(data.columns[3:]) + for h in dataColumns: + if not 'speed' in h.lower(): # proximity indicators are reversed, taking 85th centile of this column will yield the 15th centile (which we have to take the opposite again) + data[h] = -data[h] + for h in dataColumns: + for h2 in tmpheaders: + headers.append(h+'-'+h2) + for h,t in zip(dataColumns, args.eventThresholds): # each threshold in this case applies to one indicator + headers.append('n-{}-{}'.format(h, t)) + data['intervalend15'] = data.time.apply(lambda t: (pd.Timestamp(year = t.year, month = t.month, day = t.day,hour = t.hour, minute = (t.minute // 
args.intervalDuration)*args.intervalDuration)+pd.Timedelta(minutes = 15)).time()) + outputData = [] + for name, group in data.groupby(['site', 'date', 'intervalend15']): + row = [] + row.extend(name) + groupStartTime = group.time.min() + groupEndTime = group.time.max() + row.append((groupEndTime.minute+1-groupStartTime.minute) % 60)#(name[2].minute*60+name[2].second-groupStartTime.minute*60+groupStartTime.second) % 3600) + row.append(len(group)) + for h in dataColumns: + for method,func in aggFunctions.items(): + tmp = group.loc[~group[h].isna(), h] + if len(tmp)>0: + aggregated = func(tmp) # todo invert if the resulting stat is negative + if method == 'centile': + row.extend(np.abs(aggregated)) + else: + row.append(np.abs(aggregated)) + else: + row.extend([None]*len(aggFunctions)) + for h,t in zip(dataColumns, args.eventThresholds): # each threshold in this case applies to one indicator + if 'speed' in h.lower(): + row.append((group[h] > t).sum()) + else: + row.append((group[h] > -t).sum()) # take larger than the negative threshold for proximity indicators + outputData.append(row) + pd.DataFrame(outputData, columns = headers).to_csv(utils.removeExtension(args.eventFilename)+'-aggregated.csv', index = False)