There are plenty of offline OCR toolkits these days, but the OCR built into Apple's systems still has an edge: every M1 MacBook Pro ships with a dedicated AI accelerator (the Neural Engine), and the system's text recognition service holds up well against open-source alternatives in both accuracy and speed, with revisions V1, V2, and V3 bringing steady improvements. This post uses Swift to call the built-in VNRecognizeTextRequest service, exports it as a C API, and then loads the resulting dynamic library from Python with ctypes. By the same principle, any other language with a C FFI can call this API as well.

If you are an Apple user and want to try the system's built-in OCR, this post may be of some use to you.

Calling the OCR Service from Swift

import CoreImage
import Vision

// Exported under a C symbol name so the dynamic library can be loaded
// from Python via ctypes (or from any language with a C FFI).
@_cdecl("OCRTextFromImage")
public func OCRTextFromImage(filePath: UnsafePointer<CChar>) -> UnsafePointer<CChar>? {
    var text = ""
    let request = VNRecognizeTextRequest { (request, error) in
        guard let observations = request.results as? [VNRecognizedTextObservation] else {
            return
        }
        // Keep only the highest-confidence candidate for each detected line.
        for observation in observations {
            guard let topCandidate = observation.topCandidates(1).first else { continue }
            text += topCandidate.string + "\n"
        }
    }
    request.recognitionLevel = VNRequestTextRecognitionLevel.accurate
    request.usesLanguageCorrection = true
    request.revision = VNRecognizeTextRequestRevision3
    request.recognitionLanguages = ["zh", "en"]

    let imageURL = URL(fileURLWithPath: String(cString: filePath))
    guard let image = CIImage(contentsOf: imageURL) else {
        return UnsafePointer<CChar>(strdup(""))
    }
    let handler = VNImageRequestHandler(ciImage: image, options: [:])
    // perform() runs synchronously, so the completion handler above has
    // already filled `text` by the time it returns.
    try? handler.perform([request])

    // Copy the result onto the malloc heap so the pointer outlives this
    // call; ownership passes to the caller.
    return UnsafePointer<CChar>(strdup(text))
}

Compile it into a dynamic library with this command: swiftc -o OCRTextFromImage ocr.swift -emit-library
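One detail the function above glosses over: the string is copied with strdup onto the malloc heap and the caller owns it, so every call leaks its result buffer unless something frees it. A minimal sketch of a companion release function, under the hypothetical name OCRTextFree (my own addition, not part of any Apple API), could go in the same ocr.swift:

import Darwin

// Hypothetical companion to OCRTextFromImage: frees a result buffer
// that was allocated with strdup. The caller must not use the pointer
// afterwards.
@_cdecl("OCRTextFree")
public func OCRTextFree(ptr: UnsafeMutablePointer<CChar>?) {
    free(ptr)
}

Calling free here is safe because strdup allocates with the same malloc.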

Calling the C API from Python

import ctypes

# Load the dynamic library built by swiftc above.
ocr = ctypes.CDLL("OCRTextFromImage")
ocr.OCRTextFromImage.restype = ctypes.c_char_p
ocr.OCRTextFromImage.argtypes = [ctypes.c_char_p]

# Pass the image path as a C string; the result comes back as one
# recognized line per detected text region.
ret = ocr.OCRTextFromImage(ctypes.create_string_buffer(b"WechatIMG87.png"))
for i in ret.split(b'\n'):
    print(i.decode('utf-8'))
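One caveat, tying back to the memory note above: with restype set to c_char_p, ctypes copies the result into a Python bytes object and discards the raw pointer, so the strdup'd buffer can never be freed from this side. If you exported something like the hypothetical OCRTextFree sketched earlier, you would instead set restype to ctypes.c_void_p, read the text with ctypes.string_at(ptr), and then hand ptr back to ocr.OCRTextFree. For a script that processes a handful of images this hardly matters; for a long-running process it does.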

Other notes

Some people online call this from Objective-C instead, though that requires some familiarity with Obj-C; see for example this one, and also textinator.

As for recognition quality, I find it decent for printed text. For more details on VNRecognizeTextRequest, see the official documentation; it supports finer-grained results, such as the position of each piece of recognized text.
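For example, each VNRecognizedTextObservation carries a normalized bounding box alongside its text candidates. A minimal sketch, assuming you already have the observations array from the completion handler above:

import Vision

// Print each recognized string with its bounding box. Vision uses a
// normalized coordinate space: the origin is the bottom-left corner
// of the image and all values lie in 0...1.
func printObservations(_ observations: [VNRecognizedTextObservation]) {
    for observation in observations {
        guard let candidate = observation.topCandidates(1).first else { continue }
        let box = observation.boundingBox
        print("\(candidate.string): x=\(box.minX) y=\(box.minY) w=\(box.width) h=\(box.height)")
    }
}

Vision also ships VNImageRectForNormalizedRect, which converts such a box into pixel coordinates for a given image width and height.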