I'm using the front-facing TrueDepth camera together with Vision to recognize points in an image and run some measurements. I understand that Vision coordinates are normalized, so I convert the normalized Vision points to CGPoints corresponding to the view, then match them with the depth data in dataOutputSynchronizer to get a z value. Then, using the camera intrinsics, I try to get the distance between the two points in 3D space.
I have successfully found the points and (I believe) converted them to screen points. My thinking is that these CGPoints should be no different from ones I would get by tapping the screen.
My problem is that even though the converted CGPoints stay mostly the same (my hand does move a little during testing, but stays roughly in a plane facing the camera) and I compute the depth location the same way each time, the depths can be wildly different, especially for point 2. In terms of the computed distance, depth point 2 seems closer to reality (my hand was about a foot from the camera), but it varies a lot and is still not accurate.
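For context, by "using the camera intrinsics" I mean the usual pinhole back-projection; here is a minimal sketch of that last step (my own helper, not shown in the code below, and it assumes the pixel coordinates are expressed on the same grid the intrinsic matrix is referenced to):
import simd

// Pinhole back-projection: pixel (u, v) plus depth z (in meters) -> camera-space point.
// The intrinsic matrix is laid out as [fx 0 cx; 0 fy cy; 0 0 1].
func unproject(u: Float, v: Float, z: Float, intrinsics: simd_float3x3) -> SIMD3<Float> {
    let fx = intrinsics.columns.0.x
    let fy = intrinsics.columns.1.y
    let cx = intrinsics.columns.2.x
    let cy = intrinsics.columns.2.y
    return SIMD3<Float>((u - cx) * z / fx, (v - cy) * z / fy, z)
}

// The 3D distance between two points is then:
// simd_distance(unproject(u: u1, v: v1, z: z1, intrinsics: K),
//               unproject(u: u2, v: v2, z: z2, intrinsics: K))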
Here is a console print with the relevant data:
there are 2 points found
recognized points
[(499.08930909633636, 634.0807711283367), (543.7462849617004, 1061.8824380238852)]
DEPTH POINT 1 = 3.6312041
DEPTH POINT 2 = 0.2998223
there are 2 points found
recognized points
[(498.33644700050354, 681.3769372304281), (602.3667773008347, 1130.4955183664956)]
DEPTH POINT 1 = 3.6276162
DEPTH POINT 2 = 0.560331
Here is some relevant code.
dataOutputSynchronizer:
func dataOutputSynchronizer(_ synchronizer: AVCaptureDataOutputSynchronizer,
                            didOutput synchronizedDataCollection: AVCaptureSynchronizedDataCollection) {

    var handPoints: [CGPoint] = []

    // Read all outputs; only work on synced pairs
    guard renderingEnabled,
          let syncedDepthData: AVCaptureSynchronizedDepthData =
            synchronizedDataCollection.synchronizedData(for: depthDataOutput) as? AVCaptureSynchronizedDepthData,
          let syncedVideoData: AVCaptureSynchronizedSampleBufferData =
            synchronizedDataCollection.synchronizedData(for: videoDataOutput) as? AVCaptureSynchronizedSampleBufferData else {
        return
    }

    if syncedDepthData.depthDataWasDropped || syncedVideoData.sampleBufferWasDropped {
        return
    }

    let depthPixelBuffer = syncedDepthData.depthData.depthDataMap
    guard let videoPixelBuffer = CMSampleBufferGetImageBuffer(syncedVideoData.sampleBuffer) else {
        return
    }

    // Get the camera intrinsics
    guard let cameraIntrinsics = syncedDepthData.depthData.cameraCalibrationData?.intrinsicMatrix else {
        return
    }

    let handler = VNImageRequestHandler(
        cmSampleBuffer: syncedVideoData.sampleBuffer,
        orientation: .up,
        options: [:]
    )

    do {
        try handler.perform([handPoseRequest])
        guard
            let results = handPoseRequest.results?.prefix(2),
            !results.isEmpty
        else {
            return
        }

        var recognizedPoints: [VNRecognizedPoint] = []
        try results.forEach { observation in
            let fingers = try observation.recognizedPoints(.all)
            if let middleTipPoint = fingers[.middleDIP] {
                recognizedPoints.append(middleTipPoint)
            }
            if let wristPoint = fingers[.wrist] {
                recognizedPoints.append(wristPoint)
            }
        }

        // Keep only confident points, and flip Y so the origin is at the top left
        handPoints = recognizedPoints.filter {
            $0.confidence > 0.90
        }
        .map {
            CGPoint(x: $0.location.x, y: 1 - $0.location.y)
        }

        // Process the points found
        DispatchQueue.main.sync {
            self.processPoints(handPoints, depthPixelBuffer, videoPixelBuffer, cameraIntrinsics)
        }
    } catch {
        // Be more graceful here
    }
}
processPoints:
func processPoints(_ handPoints: [CGPoint], _ depthPixelBuffer: CVImageBuffer, _ videoPixelBuffer: CVImageBuffer, _ cameraIntrinsics: simd_float3x3) {

    // This converts the normalized points to screen points
    // cameraView.previewLayer is an AVCaptureVideoPreviewLayer inside a UIView
    let convertedPoints = handPoints.map {
        cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
    }

    // We need 2 hand points to get the distance
    if handPoints.count == 2 {
        print("there are 2 points found")
        print("recognized points")
        print(convertedPoints)

        let handVisionPoint1 = convertedPoints[0]
        let handVisionPoint2 = convertedPoints[1]

        let scaleFactor = CGFloat(CVPixelBufferGetWidth(depthPixelBuffer)) / CGFloat(CVPixelBufferGetWidth(videoPixelBuffer))

        CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
        defer { CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly) }

        let handVisionPixel1X = Int((handVisionPoint1.x * scaleFactor).rounded())
        let handVisionPixel1Y = Int((handVisionPoint1.y * scaleFactor).rounded())
        let handVisionPixel2X = Int((handVisionPoint2.x * scaleFactor).rounded())
        let handVisionPixel2Y = Int((handVisionPoint2.y * scaleFactor).rounded())

        let rowDataPoint1 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixel1Y * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
        let handVisionPoint1Depth = rowDataPoint1.assumingMemoryBound(to: Float32.self)[handVisionPixel1X]
        print("DEPTH POINT 1 = ", handVisionPoint1Depth)

        let rowDataPoint2 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixel2Y * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
        let handVisionPoint2Depth = rowDataPoint2.assumingMemoryBound(to: Float32.self)[handVisionPixel2X]
        print("DEPTH POINT 2 = ", handVisionPoint2Depth)
    }
}
Right now it seems to me that my logic for finding the correct pixel in the depth map is wrong. If that isn't the case, then I wonder whether the data streams are out of sync. Honestly, I'm a bit lost at this point. Thanks for any help!
The answer ended up being fairly simple. Thanks to Reality-Dev's posts on the Apple Developer forums (and his body-tracking git) for pointing me in the right direction.
In this line I was converting the normalized Vision points to screen points:
let convertedPoints = handPoints.map {
    cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
}
That was the main problem: I needed the normalized points, not the layer points. A very rookie mistake. This code gives accurate depth-map readings at the Vision points:
if handPoints.count == 2 {
    // These are still the normalized Vision points (0...1), not layer points
    let handVisionPoint1 = handPoints[0]
    let handVisionPoint2 = handPoints[1]

    guard CVPixelBufferGetPixelFormatType(depthPixelBuffer) == kCVPixelFormatType_DepthFloat32 else { return }

    let width = CVPixelBufferGetWidth(depthPixelBuffer)
    let height = CVPixelBufferGetHeight(depthPixelBuffer)

    // Map the normalized points directly onto the depth map's pixel grid
    let colPosition1 = Int(handVisionPoint1.x * CGFloat(width))
    let rowPosition1 = Int(handVisionPoint1.y * CGFloat(height))
    let colPosition2 = Int(handVisionPoint2.x * CGFloat(width))
    let rowPosition2 = Int(handVisionPoint2.y * CGFloat(height))

    CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly) }

    if let baseAddress = CVPixelBufferGetBaseAddress(depthPixelBuffer) {
        let index1 = colPosition1 + (rowPosition1 * width)
        let index2 = colPosition2 + (rowPosition2 * width)
        let offset1 = index1 * MemoryLayout<Float>.stride
        let offset2 = index2 * MemoryLayout<Float>.stride
        // Depths (in meters) at the two hand points
        let distanceValue1 = baseAddress.load(fromByteOffset: offset1, as: Float.self)
        let distanceValue2 = baseAddress.load(fromByteOffset: offset2, as: Float.self)
    }
}
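One extra safeguard that may be worth adding (my own suggestion, not part of the fix above): clamp the indices to the buffer and skip invalid depths, since the depth map can contain NaN or 0 for pixels with no data. A sketch of such a lookup helper:
import CoreVideo
import CoreGraphics

// Reads the depth (in meters) at a normalized point from a kCVPixelFormatType_DepthFloat32 buffer.
// Clamps the pixel indices, walks rows via bytes-per-row in case rows are padded,
// and returns nil for NaN / non-positive depths.
func depthValue(at normalizedPoint: CGPoint, in depthPixelBuffer: CVPixelBuffer) -> Float? {
    guard CVPixelBufferGetPixelFormatType(depthPixelBuffer) == kCVPixelFormatType_DepthFloat32 else { return nil }
    let width = CVPixelBufferGetWidth(depthPixelBuffer)
    let height = CVPixelBufferGetHeight(depthPixelBuffer)
    let col = min(max(Int(normalizedPoint.x * CGFloat(width)), 0), width - 1)
    let row = min(max(Int(normalizedPoint.y * CGFloat(height)), 0), height - 1)
    CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly) }
    guard let baseAddress = CVPixelBufferGetBaseAddress(depthPixelBuffer) else { return nil }
    let rowData = baseAddress + row * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
    let depth = rowData.assumingMemoryBound(to: Float32.self)[col]
    return (depth.isFinite && depth > 0) ? depth : nil
}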