diff --git a/Package.swift b/Package.swift index bea6a5e..3a75e13 100644 --- a/Package.swift +++ b/Package.swift @@ -3,7 +3,7 @@ import Foundation import PackageDescription -let hasFFmpeg = ProcessInfo.processInfo.environment["USE_FFMPEG"] == "1" +let hasFFmpeg = true // ProcessInfo.processInfo.environment["USE_FFMPEG"] == "1" #if arch(arm64) let includePath = "-I/opt/homebrew/include" @@ -42,7 +42,7 @@ let package = Package( ] + (hasFFmpeg ? [ .systemLibrary( name: "CFFmpeg", - pkgConfig: "libavformat libavcodec libavutil", + pkgConfig: "libavformat libavcodec libavutil libswscale", providers: [ .brew(["ffmpeg"]) ]) diff --git a/Sources/CFFmpeg/CFFmpeg.h b/Sources/CFFmpeg/CFFmpeg.h index bec3f52..4dde78e 100644 --- a/Sources/CFFmpeg/CFFmpeg.h +++ b/Sources/CFFmpeg/CFFmpeg.h @@ -4,5 +4,6 @@ #include #include #include +#include #endif diff --git a/Sources/CFFmpeg/module.modulemap b/Sources/CFFmpeg/module.modulemap index 23970e6..b622e6b 100644 --- a/Sources/CFFmpeg/module.modulemap +++ b/Sources/CFFmpeg/module.modulemap @@ -3,5 +3,6 @@ module CFFmpeg [system] { link "avformat" link "avcodec" link "avutil" + link "swscale" export * } diff --git a/Sources/macSubtitleOCR/Subtitles/FFmpeg/FFmpegBurnin.swift b/Sources/macSubtitleOCR/Subtitles/FFmpeg/FFmpegBurnin.swift new file mode 100644 index 0000000..963fdd1 --- /dev/null +++ b/Sources/macSubtitleOCR/Subtitles/FFmpeg/FFmpegBurnin.swift @@ -0,0 +1,265 @@ +// +// FFmpegBurnin.swift +// macSubtitleOCR +// +// Created by Ethan Dye on 1/1/25. +// Copyright © 2025-2025 Ethan Dye. All rights reserved. 
//

#if FFMPEG
import CFFmpeg
import CoreGraphics
import Foundation
import os
import Vision

/// Decodes the video track of a media file with FFmpeg and runs Vision text recognition on the
/// decoded frames in order to find subtitles that are burned into the picture.
struct FFmpegBurnin {
    // MARK: - Properties

    private let logger = Logger(subsystem: "com.ecdye.macSubtitleOCR", category: "FFmpeg")

    /// Captured frames keyed by stream index; filled by `analyzeFrameForSubtitles`.
    private(set) var images = [Int: [Subtitle]]()

    /// Number of frames seen so far per stream index; doubles as the `Subtitle` index.
    private var imagesIndex = [Int: Int]()

    // MARK: - Lifecycle

    /// Scans the file at `sub` for burned-in text.
    ///
    /// - Parameter sub: Path of the media file to open.
    /// - Throws: `macSubtitleOCRError.fileReadError` when the file cannot be opened and
    ///   `macSubtitleOCRError.ffmpegError` when FFmpeg cannot set up decoding.
    init(_ sub: String) throws {
        // NOTE(review): only the Vision pass runs today, so `images` stays empty. The per-stream
        // pipeline is not wired up yet: to enable it, open the input, build an `FFStream` for every
        // AVMEDIA_TYPE_VIDEO stream and hand them to `processVideoTracks(fmtCtx:streams:)`.
        try recognizeBurnedInText(inFileAt: sub)
    }

    // MARK: - Methods

    /// Reads every packet from `fmtCtx`, decodes those that belong to one of `streams` and stores
    /// the resulting frames in `images`.
    ///
    /// - Parameters:
    ///   - fmtCtx: An opened FFmpeg format context.
    ///   - streams: Decoders to use, keyed by stream index.
    private mutating func processVideoTracks(fmtCtx: UnsafeMutablePointer<AVFormatContext>?,
                                             streams: [Int: FFStream]) {
        var packetStorage = av_packet_alloc()
        var frameStorage = av_frame_alloc()
        defer {
            // Both free functions accept a pointer to nil, so this is safe on the failure path too.
            av_frame_free(&frameStorage)
            av_packet_free(&packetStorage)
        }
        guard let packet = packetStorage, let frame = frameStorage else {
            logger.error("Failed to allocate FFmpeg packet or frame")
            return
        }

        while av_read_frame(fmtCtx, packet) >= 0 {
            defer { av_packet_unref(packet) }
            let streamNumber = Int(packet.pointee.stream_index)
            logger.debug("Got packet for stream \(streamNumber)")

            // Skip packets that do not belong to a selected video stream
            guard let stream = streams[streamNumber] else { continue }

            guard avcodec_send_packet(stream.codecContext, packet) >= 0 else {
                logger.warning("Failed to send packet for stream \(streamNumber), skipping...")
                continue
            }
            receiveFrames(from: stream, streamNumber: streamNumber, into: frame)
        }

        // A nil packet switches the decoder into draining mode so that frames it is still
        // buffering at end of file are delivered instead of being dropped.
        for (streamNumber, stream) in streams where avcodec_send_packet(stream.codecContext, nil) >= 0 {
            receiveFrames(from: stream, streamNumber: streamNumber, into: frame)
        }
    }

    /// Pulls every frame currently available from the decoder of `stream` and records it.
    private mutating func receiveFrames(from stream: FFStream, streamNumber: Int,
                                        into frame: UnsafeMutablePointer<AVFrame>) {
        while avcodec_receive_frame(stream.codecContext, frame) >= 0 {
            analyzeFrameForSubtitles(frame, streamNumber: streamNumber, timeBase: stream.timeBase)
        }
    }

    /// Converts `frame` to a `CGImage` and appends it to `images[streamNumber]`.
    ///
    /// - Parameters:
    ///   - frame: The decoded frame.
    ///   - streamNumber: Index of the stream the frame came from.
    ///   - timeBase: Time base of that stream. The stream value is used because the frame's own
    ///     `time_base` field is not reliably filled in by decoders (see the AVFrame documentation).
    private mutating func analyzeFrameForSubtitles(_ frame: UnsafeMutablePointer<AVFrame>, streamNumber: Int,
                                                   timeBase: AVRational) {
        // The counter advances even when conversion fails, so indexes match decode order.
        imagesIndex[streamNumber, default: 0] += 1
        let frameIndex = imagesIndex[streamNumber, default: 0]

        guard let image = convertFrameToImage(frame) else {
            logger.warning("Skipping frame \(frameIndex)")
            return
        }
        let pts = convertToTimeInterval(frame.pointee.pts, timeBase: timeBase)
        images[streamNumber, default: []].append(Subtitle(index: frameIndex, startTimestamp: pts, image: image))
    }

    /// Converts a decoded frame to an RGBA `CGImage` of the same size.
    ///
    /// The pixel data is owned by the returned image, so it stays valid after `frame` is reused.
    ///
    /// - Parameter frame: The decoded frame to convert.
    /// - Returns: The image, or `nil` when the frame has no pixels or the conversion fails.
    func convertFrameToImage(_ frame: UnsafeMutablePointer<AVFrame>) -> CGImage? {
        let width = Int(frame.pointee.width)
        let height = Int(frame.pointee.height)
        guard width > 0, height > 0 else {
            logger.warning("Frame has invalid dimensions \(width)x\(height)")
            return nil
        }
        let bytesPerRow = width * 4

        // Prepare the SwsContext for the pixel format conversion
        guard let swsContext = sws_getContext(
            frame.pointee.width, frame.pointee.height, AVPixelFormat(frame.pointee.format),
            frame.pointee.width, frame.pointee.height, AV_PIX_FMT_RGBA,
            SWS_BILINEAR, nil, nil, nil) else {
            logger.error("Failed to initialize swsContext")
            return nil
        }
        defer { sws_freeContext(swsContext) }

        // Scale straight into a Swift-owned buffer. Handing CoreGraphics a pointer that is freed
        // when this function returns would leave the image reading released memory.
        var pixels = Data(count: bytesPerRow * height)
        let convertedRows: Int32 = pixels.withUnsafeMutableBytes { rawBuffer -> Int32 in
            guard let base = rawBuffer.bindMemory(to: UInt8.self).baseAddress else { return -1 }
            // RGBA is a packed format: only the first plane/stride entry is used.
            let destinationPlanes: [UnsafeMutablePointer<UInt8>?] = [base, nil, nil, nil]
            let destinationStrides: [Int32] = [Int32(bytesPerRow), 0, 0, 0]

            // `data` and `linesize` are imported as tuples; view them as C arrays for sws_scale.
            return withUnsafePointer(to: frame.pointee.data) { dataPointer in
                withUnsafePointer(to: frame.pointee.linesize) { linesizePointer in
                    sws_scale(
                        swsContext,
                        UnsafeRawPointer(dataPointer).assumingMemoryBound(to: UnsafePointer<UInt8>?.self),
                        UnsafeRawPointer(linesizePointer).assumingMemoryBound(to: Int32.self),
                        0,
                        frame.pointee.height,
                        destinationPlanes,
                        destinationStrides)
                }
            }
        }
        guard convertedRows > 0 else {
            logger.error("sws_scale failed to convert frame")
            return nil
        }

        guard let dataProvider = CGDataProvider(data: pixels as CFData) else {
            logger.error("Failed to create CGDataProvider")
            return nil
        }

        return CGImage(
            width: width,
            height: height,
            bitsPerComponent: 8,
            bitsPerPixel: 32,
            bytesPerRow: bytesPerRow,
            space: CGColorSpaceCreateDeviceRGB(),
            bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue),
            provider: dataProvider,
            decode: nil,
            shouldInterpolate: true,
            intent: .defaultIntent)
    }

    /// Converts a timestamp expressed in `timeBase` units to seconds.
    private func convertToTimeInterval(_ pts: some BinaryInteger, timeBase: AVRational) -> TimeInterval {
        TimeInterval(Double(pts) * av_q2d(timeBase))
    }

    /// Runs Vision text recognition over the bottom quarter of every video frame in the file and
    /// prints the recognized strings.
    ///
    /// This entry point cannot throw; failures are written to the log. `init(_:)` uses the
    /// throwing implementation directly so that callers of the initializer see the error.
    ///
    /// - Parameter mkvFilePath: Path of the media file to scan.
    func processMKVWithVision(mkvFilePath: String) {
        do {
            try recognizeBurnedInText(inFileAt: mkvFilePath)
        } catch {
            logger.error("Burned-in subtitle scan failed: \(String(describing: error), privacy: .public)")
        }
    }

    /// Throwing implementation behind `processMKVWithVision(mkvFilePath:)`.
    ///
    /// - Parameter path: Path of the media file to scan.
    /// - Throws: `macSubtitleOCRError.fileReadError` when the file cannot be opened and
    ///   `macSubtitleOCRError.ffmpegError` when the stream or decoder cannot be set up.
    private func recognizeBurnedInText(inFileAt path: String) throws {
        av_log_set_level(AV_LOG_QUIET)

        // Open the file; the deferred close is the single owner of the context.
        var fmtCtx: UnsafeMutablePointer<AVFormatContext>?
        guard avformat_open_input(&fmtCtx, path, nil, nil) == 0 else {
            throw macSubtitleOCRError.fileReadError("Failed to open input file: \(path)")
        }
        defer { avformat_close_input(&fmtCtx) }

        guard avformat_find_stream_info(fmtCtx, nil) >= 0 else {
            throw macSubtitleOCRError.ffmpegError("FFmpeg failed to find stream info")
        }

        // Find the best video stream
        var bestDecoder: UnsafePointer<AVCodec>?
        let videoStreamIndex = av_find_best_stream(fmtCtx, AVMEDIA_TYPE_VIDEO, -1, -1, &bestDecoder, 0)
        guard videoStreamIndex >= 0, let stream = fmtCtx?.pointee.streams[Int(videoStreamIndex)] else {
            throw macSubtitleOCRError.ffmpegError("No video stream found")
        }
        guard let codec = bestDecoder ?? avcodec_find_decoder(stream.pointee.codecpar.pointee.codec_id) else {
            throw macSubtitleOCRError.ffmpegError("Unsupported codec")
        }

        // Set up the decoder. Declared after the format-context defer so it is released first.
        var codecCtx = avcodec_alloc_context3(codec)
        defer { avcodec_free_context(&codecCtx) }
        guard codecCtx != nil else {
            throw macSubtitleOCRError.ffmpegError("Failed to allocate codec context")
        }
        guard avcodec_parameters_to_context(codecCtx, stream.pointee.codecpar) >= 0 else {
            throw macSubtitleOCRError.ffmpegError("Failed to copy codec parameters")
        }
        guard avcodec_open2(codecCtx, codec, nil) >= 0 else {
            throw macSubtitleOCRError.ffmpegError("Failed to open codec")
        }

        // Allocate frame and packet
        var frameStorage = av_frame_alloc()
        var packetStorage = av_packet_alloc()
        defer {
            av_frame_free(&frameStorage)
            av_packet_free(&packetStorage)
        }
        guard let frame = frameStorage, let packet = packetStorage else {
            throw macSubtitleOCRError.ffmpegError("Failed to allocate FFmpeg packet or frame")
        }

        // Initialize the Vision request; the handler runs once per processed frame.
        let log = logger
        let textRequest = VNRecognizeTextRequest { request, error in
            if let error {
                log.error("Vision request error: \(error.localizedDescription, privacy: .public)")
                return
            }
            guard let observations = request.results as? [VNRecognizedTextObservation] else { return }
            for observation in observations {
                print("Detected text: \(observation.topCandidates(1).first?.string ?? "")")
            }
        }
        textRequest.recognitionLevel = .accurate
        // Normalized coordinates with a lower-left origin: the bottom 25 % of the frame.
        textRequest.regionOfInterest = CGRect(x: 0, y: 0, width: 1, height: 0.25)

        // Read packets and process the frames they decode to
        while av_read_frame(fmtCtx, packet) >= 0 {
            defer { av_packet_unref(packet) }
            guard packet.pointee.stream_index == videoStreamIndex,
                  avcodec_send_packet(codecCtx, packet) >= 0 else { continue }
            recognizeText(inFramesFrom: codecCtx, into: frame, using: textRequest)
        }

        // Drain the frames the decoder is still holding at end of file.
        if avcodec_send_packet(codecCtx, nil) >= 0 {
            recognizeText(inFramesFrom: codecCtx, into: frame, using: textRequest)
        }
    }

    /// Receives every frame currently available from `codecCtx` and runs `request` on each.
    private func recognizeText(inFramesFrom codecCtx: UnsafeMutablePointer<AVCodecContext>?,
                               into frame: UnsafeMutablePointer<AVFrame>,
                               using request: VNRecognizeTextRequest) {
        while avcodec_receive_frame(codecCtx, frame) >= 0 {
            // Release per-frame images and Vision objects promptly instead of letting them
            // accumulate for the whole file.
            autoreleasepool {
                guard let cgImage = convertFrameToImage(frame) else {
                    logger.warning("Failed to convert frame to CGImage")
                    return
                }
                do {
                    try VNImageRequestHandler(cgImage: cgImage, options: [:]).perform([request])
                } catch {
                    logger.error("Failed to perform Vision request: \(error.localizedDescription, privacy: .public)")
                }
            }
        }
    }
}
#endif
/// Runs the burned-in subtitle extractor on `input` and OCRs every track it produced.
///
/// Tracks are processed in ascending track-number order so that the order of the returned
/// results does not depend on dictionary iteration order.
///
/// - Returns: One result per track found by `FFmpegBurnin`.
/// - Throws: Any error raised by `FFmpegBurnin.init(_:)` or `processSubtitle`.
private func processFFmpegBurnin() async throws -> [macSubtitleOCRResult] {
    let ffmpeg = try FFmpegBurnin(input)

    var results: [macSubtitleOCRResult] = []
    results.reserveCapacity(ffmpeg.images.count)

    for (trackNumber, subtitles) in ffmpeg.images.sorted(by: { $0.key < $1.key }) {
        logger.debug("Processing subtitle track: \(trackNumber)")
        results.append(try await processSubtitle(subtitles, trackNumber: trackNumber))
    }

    return results
}
b/Sources/macSubtitleOCR/macSubtitleOCROptions.swift index 9e56fd1..68f79fb 100644 --- a/Sources/macSubtitleOCR/macSubtitleOCROptions.swift +++ b/Sources/macSubtitleOCR/macSubtitleOCROptions.swift @@ -35,6 +35,9 @@ struct Options: ParsableArguments { #if FFMPEG @Flag(name: [.customShort("f"), .long], help: "Use FFmpeg decoder") var ffmpegDecoder = false + + @Flag(name: [.customShort("b"), .long], help: "Extract burned in subtitles from video") + var burnIn = false #endif @Flag(help: "Disable correction of 'l' to 'I' in OCR results")