Skip to content

Commit

Permalink
Support Word Timestamps (#38)
Browse files Browse the repository at this point in the history
* Add word level timestamp handling

* Make wordtimings properties public

Co-authored-by: Finn Voorhees <[email protected]>

* Fix crash for empty alignments

Co-authored-by: Finn Voorhees <[email protected]>

* Update and test merging logic, fix alignment off by one issue

* Add remaining word timestamp heuristics, remove special tokens

* Fix sampleLength early loop termination

---------

Co-authored-by: Finn Voorhees <[email protected]>
  • Loading branch information
ZachNagengast and finnvoor authored Mar 2, 2024
1 parent 2846fd9 commit dda6571
Show file tree
Hide file tree
Showing 14 changed files with 1,385 additions and 455 deletions.
312 changes: 154 additions & 158 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Sources/WhisperKit/Core/AudioEncoder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class AudioEncoder: AudioEncoding, WhisperMLModel {
public init() {}

public func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? {
// Make sure features is shape MultiArray (Float32 1 × 80,128 × 3000)
// Make sure features is shape MultiArray (Float32 1 × {80,128} × 3000)
let modelInputs = AudioEncoderInput(melspectrogram_features: features)

guard let model = model else {
Expand Down
19 changes: 9 additions & 10 deletions Sources/WhisperKit/Core/AudioProcessor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,8 @@ public protocol AudioProcessing {
func stopRecording()
}

// Overrideable default methods for AudioProcessing
/// Overrideable default methods for AudioProcessing
public extension AudioProcessing {
// Use default recording device
/// Default implementation that forwards `callback` unchanged to `startRecordingLive(callback:)`.
///
/// - Parameter callback: Optional closure taking a `[Float]`; passed straight through.
/// - Throws: Rethrows whatever the forwarded call throws.
///
/// NOTE(review): the forwarded call has the same name and signature as this method.
/// Presumably it dispatches to the conforming type's own implementation; confirm that a
/// conformer which omits that implementation cannot end up recursing into this default.
func startRecordingLive(callback: (([Float]) -> Void)?) throws {
try startRecordingLive(callback: callback)
}
Expand Down Expand Up @@ -229,12 +228,12 @@ public class AudioProcessor: NSObject, AudioProcessing {
var error: NSError?
let status = converter.convert(to: outputBuffer, error: &error, withInputFrom: inputBlock)
switch status {
case .error:
if let conversionError = error {
Logging.error("Error converting audio file: \(conversionError)")
}
return nil
default: break
case .error:
if let conversionError = error {
Logging.error("Error converting audio file: \(conversionError)")
}
return nil
default: break
}

return outputBuffer
Expand Down Expand Up @@ -288,7 +287,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
let refEnergy = 20 * log10(referenceEnergy)

// Normalize based on reference
// Note: since signalEnergy elements are floats from 0 to 1, max (full volume) is always 0dB
// NOTE: since signalEnergy elements are floats from 0 to 1, max (full volume) is always 0dB
let normalizedEnergy = rescale(value: dbEnergy, min: refEnergy, max: 0)

// Clamp from 0 to 1
Expand Down Expand Up @@ -316,7 +315,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *)
public extension AudioProcessor {
/// We have a new buffer, process and store it.
/// Note: Assumes audio is 16khz mono
/// NOTE: Assumes audio is 16khz mono
func processBuffer(_ buffer: [Float]) {
audioSamples.append(contentsOf: buffer)

Expand Down
5 changes: 3 additions & 2 deletions Sources/WhisperKit/Core/AudioStreamTranscriber.swift
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import Foundation

extension AudioStreamTranscriber {
public struct State {
public extension AudioStreamTranscriber {
struct State {
public var isRecording: Bool = false
public var currentFallbacks: Int = 0
public var lastBufferSize: Int = 0
Expand All @@ -26,6 +26,7 @@ public actor AudioStreamTranscriber {
stateChangeCallback?(oldValue, state)
}
}

private let stateChangeCallback: AudioStreamTranscriberCallback?

private let requiredSegmentsForConfirmation: Int
Expand Down
Loading

0 comments on commit dda6571

Please sign in to comment.