changeset 1238:b684135d817f

version 1 of dltrack without coordinate projection
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Tue, 03 Oct 2023 16:51:39 -0400
parents 31a441efca6c
children 31173c4699d2
files scripts/compute-clearmot.py scripts/dltrack.py scripts/undistort-video.py
diffstat 3 files changed, 113 insertions(+), 110 deletions(-)
--- a/scripts/compute-clearmot.py	Mon Oct 02 16:51:43 2023 -0400
+++ b/scripts/compute-clearmot.py	Tue Oct 03 16:51:39 2023 -0400
@@ -20,7 +20,7 @@
 parser.add_argument('-g', dest = 'groundTruthDatabaseFilename', help = 'name of the Sqlite database containing the ground truth', required = True)
 parser.add_argument('-o', dest = 'homographyFilename', help = 'name of the filename for the homography (if tracking was done using the homography)')
 parser.add_argument('-m', dest = 'matchingDistance', help = 'matching distance between tracker and ground truth trajectories', required = True, type = float)
-parser.add_argument('--mask', dest = 'maskFilename', help = 'filename of the mask file used to define the where objects were tracked')
+parser.add_argument('-k', dest = 'maskFilename', help = 'filename of the mask file used to define where objects were tracked')
 parser.add_argument('-f', dest = 'firstInstant', help = 'first instant for measurement', required = True, type = int)
 parser.add_argument('-l', dest = 'lastInstant', help = 'last instant for measurement', required = True, type = int)
 parser.add_argument('--offset', dest = 'nFramesOffsetAnnotations', help = 'number of frames to offset the ground truth annotations', type = int)
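The scripts in this changeset take the mask file with the short flag -k (renamed from --mask in compute-clearmot.py and undistort-video.py, newly added in dltrack.py). A minimal argparse sketch of the renamed option, with a hypothetical mask filename:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-k', dest = 'maskFilename', help = 'filename of the mask file used to define where objects were tracked')
args = parser.parse_args(['-k', 'mask.png']) # hypothetical mask filename
print(args.maskFilename) # mask.png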
--- a/scripts/dltrack.py	Mon Oct 02 16:51:43 2023 -0400
+++ b/scripts/dltrack.py	Tue Oct 03 16:51:39 2023 -0400
@@ -12,63 +12,29 @@
 
 from trafficintelligence import cvutils, moving, storage, utils
 
-parser = argparse.ArgumentParser(description='The program tracks objects following the ultralytics yolo executable.')#, epilog = 'Either the configuration filename or the other parameters (at least video and database filenames) need to be provided.')
+parser = argparse.ArgumentParser(description='The program tracks objects using the ultralytics models and trackers.')#, epilog = 'Either the configuration filename or the other parameters (at least video and database filenames) need to be provided.')
 parser.add_argument('-i', dest = 'videoFilename', help = 'name of the video file', required = True)
 parser.add_argument('-d', dest = 'databaseFilename', help = 'name of the Sqlite database file', required = True)
 parser.add_argument('-m', dest = 'detectorFilename', help = 'name of the detection model file', required = True)
 parser.add_argument('-t', dest = 'trackerFilename', help = 'name of the tracker file', required = True)
-parser.add_argument('--display', dest = 'display', help = 'show the results (careful with long videos, risk of running out of memory)', action = 'store_true')
+parser.add_argument('-o', dest = 'homographyFilename', help = 'filename of the homography matrix', default = 'homography.txt')
+parser.add_argument('-k', dest = 'maskFilename', help = 'name of the mask file')
+parser.add_argument('--undistort', dest = 'undistort', help = 'undistort the video', action = 'store_true')
+parser.add_argument('--intrinsic', dest = 'intrinsicCameraMatrixFilename', help = 'name of the intrinsic camera file')
+parser.add_argument('--distortion-coefficients', dest = 'distortionCoefficients', help = 'distortion coefficients', nargs = '*', type = float)
+parser.add_argument('--display', dest = 'display', help = 'show the raw detection and tracking results', action = 'store_true')
 parser.add_argument('-f', dest = 'firstFrameNum', help = 'first frame number to process', type = int, default = 0)
 parser.add_argument('-l', dest = 'lastFrameNum', help = 'last frame number to process', type = int, default = float('Inf'))
 parser.add_argument('--bike-prop', dest = 'bikeProportion', help = 'minimum proportion of time a person must be classified as bike or motorbike to be classified as cyclist', type = float, default = 0.2)
 parser.add_argument('--cyclist-iou', dest = 'cyclistIou', help = 'IoU threshold to associate a bike and ped bounding box', type = float, default = 0.15)
 parser.add_argument('--cyclist-match-prop', dest = 'cyclistMatchingProportion', help = 'minimum proportion of time a bike exists and is associated with a pedestrian to be merged as cyclist', type = float, default = 0.3)
-# mask!!
+parser.add_argument('--max-temp-overlap', dest = 'maxTemporalOverlap', help = 'maximum proportion of overlapping time for 2 bikes associated with the same pedestrian to be merged', type = float, default = 0.05)
 args = parser.parse_args()
 
-# required functionality?
-# # filename of the video to process (can be images, eg image%04d.png)
-# video-filename = laurier.avi
-# # filename of the database where results are saved
-# database-filename = laurier.sqlite
-# # filename of the homography matrix
-# homography-filename = laurier-homography.txt
-# # filename of the camera intrinsic matrix
-# intrinsic-camera-filename = intrinsic-camera.txt
-# # -0.11759321 0.0148536 0.00030756 -0.00020578 -0.00091816
-# distortion-coefficients = -0.11759321
-# distortion-coefficients = 0.0148536
-# distortion-coefficients = 0.00030756 
-# distortion-coefficients = -0.00020578 
-# distortion-coefficients = -0.00091816
-# # undistorted image multiplication
-# undistorted-size-multiplication = 1.31
-# # Interpolation method for remapping image when correcting for distortion: 0 for INTER_NEAREST - a nearest-neighbor interpolation; 1 for INTER_LINEAR - a bilinear interpolation (used by default); 2 for INTER_CUBIC - a bicubic interpolation over 4x4 pixel neighborhood; 3 for INTER_LANCZOS4
-# interpolation-method = 1
-# # filename of the mask image (where features are detected)
-# mask-filename = none
-# # undistort the video for feature tracking
-# undistort = false
-# # load features from database
-# load-features = false
-# # display trajectories on the video
-# display = false
-# # original video frame rate (number of frames/s)
-# video-fps = 29.97
-# # number of digits of precision for all measurements derived from video
-# # measurement-precision = 3
-# # first frame to process
-# frame1 = 0
-# # number of frame to process: 0 means processing all frames
-# nframes = 0
-
 # TODO add option to refine position with mask for vehicles
 
 # use 2 x bytetrack track buffer to remove objects from existing ones
 
-
-# check if one can go to specific frame https://docs.ultralytics.com/modes/track/#persisting-tracks-loop
-
 # Load a model
 model = YOLO(args.detectorFilename) # seg yolov8x-seg.pt
 # seg could be used on cropped image... if can be loaded and kept in memory
@@ -80,11 +46,8 @@
     cv2.namedWindow(windowName, cv2.WINDOW_NORMAL)
 
 capture = cv2.VideoCapture(args.videoFilename)
-#results = model.track(source=args.videoFilename, tracker="/home/nicolas/Research/Data/classification-models/bytetrack.yaml", classes=list(moving.cocoTypeNames.keys()), stream=True)
-objects = []
-currentObjects = {}
-featureNum = 0
-
+objects = {}
+featureNum = 1
 frameNum = args.firstFrameNum
 capture.set(cv2.CAP_PROP_POS_FRAMES, frameNum)
 lastFrameNum = args.lastFrameNum
@@ -95,27 +58,30 @@
 while capture.isOpened() and success and frameNum <= lastFrameNum:
 #for frameNum, result in enumerate(results):
     result = results[0]
-    print(frameNum, len(result.boxes), 'objects')
+    if frameNum % 10 == 0:
+        print(frameNum, len(result.boxes), 'objects')
     for box in result.boxes:
         #print(box.cls, box.id, box.xyxy)
         if box.id is not None: # None are objects with low confidence
             num = int(box.id.item())
             #xyxy = box.xyxy[0].tolist()
-            if num in currentObjects:
-                currentObjects[num].timeInterval.last = frameNum
-                currentObjects[num].bboxes[frameNum] = copy(box.xyxy)
-                currentObjects[num].userTypes.append(moving.coco2Types[int(box.cls.item())])
-                currentObjects[num].features[0].tmpPositions[frameNum] = moving.Point(box.xyxy[0,0].item(), box.xyxy[0,1].item())
-                currentObjects[num].features[1].tmpPositions[frameNum] = moving.Point(box.xyxy[0,2].item(), box.xyxy[0,3].item())
+            if num in objects:
+                objects[num].timeInterval.last = frameNum
+                objects[num].features[0].timeInterval.last = frameNum
+                objects[num].features[1].timeInterval.last = frameNum
+                objects[num].bboxes[frameNum] = copy(box.xyxy)
+                objects[num].userTypes.append(moving.coco2Types[int(box.cls.item())])
+                objects[num].features[0].tmpPositions[frameNum] = moving.Point(box.xyxy[0,0].item(), box.xyxy[0,1].item())
+                objects[num].features[1].tmpPositions[frameNum] = moving.Point(box.xyxy[0,2].item(), box.xyxy[0,3].item())
             else:
-                inter = moving.TimeInterval(frameNum,frameNum)
-                currentObjects[num] = moving.MovingObject(num, inter)
-                currentObjects[num].bboxes = {frameNum: copy(box.xyxy)}
-                currentObjects[num].userTypes = [moving.coco2Types[int(box.cls.item())]]
-                currentObjects[num].features = [moving.MovingObject(featureNum), moving.MovingObject(featureNum+1)]
-                currentObjects[num].featureNumbers = [featureNum, featureNum+1]
-                currentObjects[num].features[0].tmpPositions = {frameNum: moving.Point(box.xyxy[0,0].item(), box.xyxy[0,1].item())}
-                currentObjects[num].features[1].tmpPositions = {frameNum: moving.Point(box.xyxy[0,2].item(), box.xyxy[0,3].item())}
+                inter = moving.TimeInterval(frameNum, frameNum)
+                objects[num] = moving.MovingObject(num, inter)
+                objects[num].bboxes = {frameNum: copy(box.xyxy)}
+                objects[num].userTypes = [moving.coco2Types[int(box.cls.item())]]
+                objects[num].features = [moving.MovingObject(featureNum, copy(inter)), moving.MovingObject(featureNum+1, copy(inter))]
+                objects[num].featureNumbers = [featureNum, featureNum+1]
+                objects[num].features[0].tmpPositions = {frameNum: moving.Point(box.xyxy[0,0].item(), box.xyxy[0,1].item())}
+                objects[num].features[1].tmpPositions = {frameNum: moving.Point(box.xyxy[0,2].item(), box.xyxy[0,3].item())}
                 featureNum += 2
     if args.display:
         cvutils.cvImshow(windowName, result.plot()) # original image in orig_img
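For context, this loop reads frames with OpenCV and calls the ultralytics tracker with persist=True so that track ids are kept across successive calls. A minimal standalone sketch of that loop follows (the weight and video filenames are hypothetical, and the trafficintelligence bookkeeping is omitted):

import cv2
from ultralytics import YOLO

model = YOLO('yolov8x.pt')              # hypothetical detector weights
capture = cv2.VideoCapture('video.avi') # hypothetical video file
success, frame = capture.read()
while capture.isOpened() and success:
    # persist=True keeps the tracker state so that box ids stay stable across calls
    results = model.track(frame, persist=True)
    for box in results[0].boxes:
        if box.id is not None: # detections with too low confidence have no id
            print(int(box.id.item()), int(box.cls.item()), box.xyxy[0].tolist())
    success, frame = capture.read()
capture.release()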
@@ -127,73 +93,110 @@
     results = model.track(frame, persist=True)
 
 # classification
-for num, obj in currentObjects.items():
-    #obj.setUserType(utils.mostCommon(obj.userTypes)) # improve? mix with speed?
-    userTypeStats = Counter(obj.userTypes)
-    if (4 in userTypeStats or (3 in userTypeStats and 4 in userTypeStats and userTypeStats[3]<=userTypeStats[4])) and userTypeStats[3]+userTypeStats[4] > args.bikeProportion*userTypeStats.total(): # 3 is motorcycle and 4 is cyclist (verif if not turning all motorbike into cyclists)
-        obj.setUserType(4)
-    else:
-        obj.setUserType(userTypeStats.most_common()[0][0])
+for num, obj in objects.items():
+    obj.setUserType(utils.mostCommon(obj.userTypes)) # improve? mix with speed?
 
+# add quality control: avoid U-turns
+    
 # merge bikes and people
-twowheels = [num for num, obj in currentObjects.items() if obj.getUserType() in (3,4)]
-pedestrians = [num for num, obj in currentObjects.items() if obj.getUserType() == 2]
+twowheels = [num for num, obj in objects.items() if obj.getUserType() in (3,4)]
+pedestrians = [num for num, obj in objects.items() if obj.getUserType() == 2]
 
+def mergeObjects(obj1, obj2):
+    obj1.features = obj1.features+obj2.features
+    obj1.featureNumbers = obj1.featureNumbers+obj2.featureNumbers
+    obj1.timeInterval = moving.TimeInterval(min(obj1.getFirstInstant(), obj2.getFirstInstant()), max(obj1.getLastInstant(), obj2.getLastInstant()))    
+    
 costs = []
 for twInd in twowheels:
-    tw = currentObjects[twInd]
+    tw = objects[twInd]
+    tw.nBBoxes = len(tw.bboxes)
     twCost = []
     for pedInd in pedestrians:
-        ped = currentObjects[pedInd]
+        ped = objects[pedInd]
         nmatches = 0
         for t in tw.bboxes:
             if t in ped.bboxes:
                 #print(tw.num, ped.num, t, box_iou(tw.bboxes[t], ped.bboxes[t]))
-                if box_iou(tw.bboxes[t], ped.bboxes[t]).item() > args.cyclistIou:
+                if not tw.commonTimeInterval(ped).empty() and box_iou(tw.bboxes[t], ped.bboxes[t]).item() > args.cyclistIou:
                     nmatches += 1
-        twCost.append(nmatches/len(tw.bboxes))
+        twCost.append(nmatches/tw.nBBoxes)
     costs.append(twCost)
 
 costs = -np.array(costs)
+
 # before matching, scan for pedestrians with good non-overlapping temporal match with different bikes
-for pedInd in costs.shape[1]:
-    if sum(costs[:,pedInd] < -args.cyclistMatchingProportion) >1:
-        twIndices = np.nonzero(costs[:,pedInd] < -args.cyclistMatchingProportion)
-        # we have to compute temporal overlaps with everyone else, then remove the ones with the most overlap (sum over column) one by one until there is little left
-        temporalOverlaps = np.zeros((len(twIndices),len(twIndices)))
-        
+for pedInd in range(costs.shape[1]):
+    nMatchedBikes = (costs[:,pedInd] < -args.cyclistMatchingProportion).sum()
+    if nMatchedBikes == 0: # peds that have no bike matching: see if they have been classified as bikes sometimes
+        ped = objects[pedestrians[pedInd]] # look up the pedestrian; obj from the classification loop above would be stale here
+        userTypeStats = Counter(ped.userTypes)
+        if (4 in userTypeStats or (3 in userTypeStats and 4 in userTypeStats and userTypeStats[3]<=userTypeStats[4])) and userTypeStats[3]+userTypeStats[4] > args.bikeProportion*userTypeStats.total(): # 3 is motorcycle and 4 is cyclist (verif if not turning all motorbike into cyclists)
+            ped.setUserType(4)
+    elif nMatchedBikes > 1: # try to merge bikes first
+        twIndices = np.nonzero(costs[:,pedInd] < -args.cyclistMatchingProportion)[0]
+        # we have to compute temporal overlaps of all 2 wheels among themselves, then remove the ones with the most overlap (sum over column) one by one until there is little left
+        nTwoWheels = len(twIndices)
+        twTemporalOverlaps = np.zeros((nTwoWheels,nTwoWheels))
+        for i in range(nTwoWheels):
+            for j in range(i):
+                twi = objects[twowheels[twIndices[i]]]
+                twj = objects[twowheels[twIndices[j]]]
+                twTemporalOverlaps[i,j] = len(set(twi.bboxes).intersection(set(twj.bboxes)))/max(len(twi.bboxes), len(twj.bboxes))
+                #twTemporalOverlaps[j,i] = twTemporalOverlaps[i,j]
+        tw2merge = list(range(nTwoWheels))
+        while len(tw2merge)>0 and (twTemporalOverlaps[np.ix_(tw2merge, tw2merge)] > args.maxTemporalOverlap).sum(0).max() >= 2:
+            i = (twTemporalOverlaps[np.ix_(tw2merge, tw2merge)] > args.maxTemporalOverlap).sum(0).argmax()
+            del tw2merge[i]
+        twIndices = [twIndices[i] for i in tw2merge]
+        tw1 = objects[twowheels[twIndices[0]]]
+        twCost = costs[twIndices[0],:]*tw1.nBBoxes
+        nBBoxes = tw1.nBBoxes
+        for twInd in twIndices[1:]:
+            mergeObjects(tw1, objects[twowheels[twInd]])
+            twCost = twCost + costs[twInd,:]*objects[twowheels[twInd]].nBBoxes
+            nBBoxes += objects[twowheels[twInd]].nBBoxes
+        twIndicesToKeep = list(range(costs.shape[0]))
+        for twInd in twIndices[1:]:
+            twIndicesToKeep.remove(twInd)
+            del objects[twowheels[twInd]]
+        twowheels = [twowheels[i] for i in twIndicesToKeep]
+        costs = costs[twIndicesToKeep,:]
 
 twIndices, matchingPedIndices = linear_sum_assignment(costs)
 for twInd, pedInd in zip(twIndices, matchingPedIndices): # caution indices in the cost matrix
     if -costs[twInd, pedInd] >= args.cyclistMatchingProportion:
-        tw = currentObjects[twowheels[twInd]]
-        ped = currentObjects[pedestrians[pedInd]]
-        timeInstants = set(tw.bboxes).union(set(ped.bboxes))
-        for t in timeInstants:
-            if t in tw.bboxes and t in ped.bboxes:
-                tw.features[0].tmpPositions[t] = moving.Point(min(tw.features[0].tmpPositions[t].x, ped.features[0].tmpPositions[t].x),
-                                                              min(tw.features[0].tmpPositions[t].y, ped.features[0].tmpPositions[t].y))
-                tw.features[1].tmpPositions[t] = moving.Point(max(tw.features[1].tmpPositions[t].x, ped.features[1].tmpPositions[t].x),
-                                                              max(tw.features[1].tmpPositions[t].y, ped.features[1].tmpPositions[t].y))
-            elif t in ped.bboxes:
-                tw.features[0].tmpPositions[t] = ped.features[0].tmpPositions[t]
-                tw.features[1].tmpPositions[t] = ped.features[1].tmpPositions[t]
-        tw.timeInterval = moving.TimeInterval(min(tw.getFirstInstant(), ped.getFirstInstant()), max(tw.getLastInstant(), ped.getLastInstant()))
-        del currentObjects[pedestrians[pedInd]]
-#Verif overlap piéton vélo : si long hors overlap, changement mode (trouver exemples)
+        tw = objects[twowheels[twInd]]
+        ped = objects[pedestrians[pedInd]]
+        mergeObjects(tw, ped)
+        del objects[pedestrians[pedInd]]
+        #TODO check the pedestrian/bike overlap: if the time outside the overlap is long, change the user type (find examples)
 
-# interpolate and generate velocity (?) before saving
-for num, obj in currentObjects.items():
-    obj.features[0].timeInterval = copy(obj.getTimeInterval())
-    obj.features[1].timeInterval = copy(obj.getTimeInterval())
-    if obj.length() != len(obj.features[0].tmpPositions): # interpolate
-        obj.features[0].positions = moving.Trajectory.fromPointDict(obj.features[0].tmpPositions)
-        obj.features[1].positions = moving.Trajectory.fromPointDict(obj.features[1].tmpPositions)
-    else:
-        obj.features[0].positions = moving.Trajectory.fromPointList(list(obj.features[0].tmpPositions.values()))
-        obj.features[1].positions = moving.Trajectory.fromPointList(list(obj.features[1].tmpPositions.values()))
-        
-storage.saveTrajectoriesToSqlite(args.databaseFilename, list(currentObjects.values()), 'object')
+# interpolate and generate velocity (?) for the features (bboxes) before saving
+for num, obj in objects.items():
+    #obj.features[1].timeInterval = copy(obj.getTimeInterval())
+    for f in obj.getFeatures():
+        if f.length() != len(f.tmpPositions): # interpolate
+            f.positions = moving.Trajectory.fromPointDict(f.tmpPositions)
+            #obj.features[1].positions = moving.Trajectory.fromPointDict(obj.features[1].tmpPositions)
+        else:
+            f.positions = moving.Trajectory.fromPointList(list(f.tmpPositions.values()))
+            #obj.features[1].positions = moving.Trajectory.fromPointList(list(obj.features[1].tmpPositions.values()))
+
+storage.saveTrajectoriesToSqlite(args.databaseFilename, list(objects.values()), 'object')
 
 # todo save bbox and mask to study localization / representation
 # apply quality checks deviation and acceleration bounds?
+
+# def mergeBBoxes(tw, ped):
+#     'merges ped into tw (2nd obj into first obj)'
+#     timeInstants = set(tw.bboxes).union(set(ped.bboxes))
+#     for t in timeInstants:
+#         if t in tw.bboxes and t in ped.bboxes:
+#             tw.features[0].tmpPositions[t] = moving.Point(min(tw.features[0].tmpPositions[t].x, ped.features[0].tmpPositions[t].x),
+#                                                           min(tw.features[0].tmpPositions[t].y, ped.features[0].tmpPositions[t].y))
+#             tw.features[1].tmpPositions[t] = moving.Point(max(tw.features[1].tmpPositions[t].x, ped.features[1].tmpPositions[t].x),
+#                                                           max(tw.features[1].tmpPositions[t].y, ped.features[1].tmpPositions[t].y))
+#         elif t in ped.bboxes:
+#             tw.features[0].tmpPositions[t] = ped.features[0].tmpPositions[t]
+#             tw.features[1].tmpPositions[t] = ped.features[1].tmpPositions[t]
+#     tw.timeInterval = moving.TimeInterval(min(tw.getFirstInstant(), ped.getFirstInstant()), max(tw.getLastInstant(), ped.getLastInstant()))
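The bike/pedestrian merging above builds a cost matrix of co-detection proportions (frames where the two boxes have IoU above --cyclist-iou, divided by the number of bike boxes) and solves the assignment with scipy's linear_sum_assignment. A self-contained sketch of that matching on toy data, using a plain IoU helper instead of the box_iou call used by the script (object numbers, boxes and frame numbers are made up):

import numpy as np
from scipy.optimize import linear_sum_assignment

def iou(a, b):
    'intersection over union of two boxes [x1, y1, x2, y2]'
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0., x2-x1)*max(0., y2-y1)
    return inter/((a[2]-a[0])*(a[3]-a[1])+(b[2]-b[0])*(b[3]-b[1])-inter)

cyclistIou, cyclistMatchingProportion = 0.15, 0.3 # same defaults as the script

# made-up per-frame boxes indexed by object number, then frame number
bikes = {1: {10: [0, 0, 2, 2], 11: [1, 0, 3, 2]}}
peds = {5: {10: [0, 0, 2, 2], 11: [1, 0, 3, 2], 12: [2, 0, 4, 2]}}

costs = []
for twNum, twBoxes in bikes.items():
    twCost = []
    for pedNum, pedBoxes in peds.items():
        nmatches = sum(1 for t in twBoxes if t in pedBoxes and iou(twBoxes[t], pedBoxes[t]) > cyclistIou)
        twCost.append(nmatches/len(twBoxes))
    costs.append(twCost)
costs = -np.array(costs) # negated since linear_sum_assignment minimizes

twIndices, pedIndices = linear_sum_assignment(costs)
for i, j in zip(twIndices, pedIndices):
    if -costs[i, j] >= cyclistMatchingProportion:
        print('merge bike', list(bikes)[i], 'and pedestrian', list(peds)[j])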
--- a/scripts/undistort-video.py	Mon Oct 02 16:51:43 2023 -0400
+++ b/scripts/undistort-video.py	Tue Oct 03 16:51:39 2023 -0400
@@ -16,7 +16,7 @@
 parser.add_argument('--intrinsic', dest = 'intrinsicCameraMatrixFilename', help = 'name of the intrinsic camera file')
 parser.add_argument('--distortion-coefficients', dest = 'distortionCoefficients', help = 'distortion coefficients', nargs = '*', type = float)
 parser.add_argument('--undistorted-multiplication', dest = 'undistortedImageMultiplication', help = 'undistorted image multiplication', type = float, default = 1.)
-parser.add_argument('--mask', dest = 'maskFilename', help = 'name of the mask file, to undistort to see how it covers the undistortion errors')
+parser.add_argument('-k', dest = 'maskFilename', help = 'name of the mask file, also undistorted to see how it covers the undistortion errors')
 parser.add_argument('-f', dest = 'firstFrameNum', help = 'first frame number to display', type = int, default = 0)
 parser.add_argument('-l', dest = 'lastFrameNum', help = 'last frame number to save', type = int)
 parser.add_argument('-d', dest = 'destinationDirname', help = 'name of the directory where the undistorted frames are saved')
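dltrack.py now accepts the same camera options as undistort-video.py (--undistort, --intrinsic, --distortion-coefficients), although per the commit message this first version does not yet apply coordinate projection. A rough sketch of undistorting a frame with plain OpenCV given those inputs (the trafficintelligence scripts typically go through cvutils helpers instead; the filenames and the plain-text format of the intrinsic matrix are assumptions, and the coefficients are the example values from the old commented-out configuration):

import cv2
import numpy as np

# hypothetical input files matching the new dltrack.py options
intrinsicCameraMatrix = np.loadtxt('intrinsic-camera.txt') # assumed to be a whitespace-separated 3x3 matrix
distortionCoefficients = np.array([-0.11759321, 0.0148536, 0.00030756, -0.00020578, -0.00091816])

capture = cv2.VideoCapture('video.avi') # hypothetical video file
success, frame = capture.read()
if success:
    undistorted = cv2.undistort(frame, intrinsicCameraMatrix, distortionCoefficients)
    cv2.imwrite('undistorted-frame.png', undistorted)
capture.release()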