diff --git a/Package.swift b/Package.swift index a2a5f57..d99b026 100644 --- a/Package.swift +++ b/Package.swift @@ -4,7 +4,7 @@ import PackageDescription let package = Package( name: "ElevenLabsSwift", platforms: [ - .iOS(.v15) + .iOS(.v15), ], products: [ .library( @@ -15,6 +15,6 @@ let package = Package( targets: [ .target( name: "ElevenLabsSwift" - ) + ), ] ) diff --git a/Sources/ElevenLabsSwift/ElevenLabsSwift.swift b/Sources/ElevenLabsSwift/ElevenLabsSwift.swift index adf98f4..dba5d5f 100644 --- a/Sources/ElevenLabsSwift/ElevenLabsSwift.swift +++ b/Sources/ElevenLabsSwift/ElevenLabsSwift.swift @@ -1,13 +1,12 @@ -import Foundation import AVFoundation import Combine +import Foundation import os.log - /// Main class for ElevenLabsSwift package public class ElevenLabsSwift { public static let version = "1.0.0" - + private enum Constants { static let defaultApiOrigin = "wss://api.elevenlabs.io" static let defaultApiPathname = "/v1/convai/conversation?agent_id=" @@ -17,19 +16,19 @@ public class ElevenLabsSwift { static let fadeOutDuration: TimeInterval = 2.0 static let bufferSize: AVAudioFrameCount = 1024 } - + // MARK: - Audio Utilities public static func arrayBufferToBase64(_ data: Data) -> String { data.base64EncodedString() } - + public static func base64ToArrayBuffer(_ base64: String) -> Data? { Data(base64Encoded: base64) } - + // MARK: - Audio Processing - + public class AudioConcatProcessor { private var buffers: [Data] = [] private var cursor: Int = 0 @@ -37,13 +36,13 @@ public class ElevenLabsSwift { private var wasInterrupted: Bool = false private var finished: Bool = false public var onProcess: ((Bool) -> Void)? - + public func process(outputs: inout [[Float]]) { var isFinished = false let outputChannel = 0 var outputBuffer = outputs[outputChannel] var outputIndex = 0 - + while outputIndex < outputBuffer.count { if currentBuffer == nil { if buffers.isEmpty { @@ -53,41 +52,41 @@ public class ElevenLabsSwift { currentBuffer = buffers.removeFirst() cursor = 0 } - + if let currentBuffer = currentBuffer { let remainingSamples = currentBuffer.count / 2 - cursor let samplesToWrite = min(remainingSamples, outputBuffer.count - outputIndex) - + guard let int16ChannelData = currentBuffer.withUnsafeBytes({ $0.bindMemory(to: Int16.self).baseAddress }) else { print("Failed to access Int16 channel data.") break } - - for sampleIndex in 0..= currentBuffer.count / 2 { self.currentBuffer = nil } } } - + outputs[outputChannel] = outputBuffer - - if self.finished != isFinished { - self.finished = isFinished + + if finished != isFinished { + finished = isFinished onProcess?(isFinished) } } public func handleMessage(_ message: [String: Any]) { guard let type = message["type"] as? String else { return } - + switch type { case "buffer": if let buffer = message["buffer"] as? Data { @@ -107,39 +106,39 @@ public class ElevenLabsSwift { } } } - + // MARK: - Connection - + public struct SessionConfig: Sendable { public let signedUrl: String? public let agentId: String? - + public init(signedUrl: String) { self.signedUrl = signedUrl - self.agentId = nil + agentId = nil } - + public init(agentId: String) { self.agentId = agentId - self.signedUrl = nil + signedUrl = nil } } - + public class Connection: @unchecked Sendable { public let socket: URLSessionWebSocketTask public let conversationId: String public let sampleRate: Int - + private init(socket: URLSessionWebSocketTask, conversationId: String, sampleRate: Int) { self.socket = socket self.conversationId = conversationId self.sampleRate = sampleRate } - + public static func create(config: SessionConfig) async throws -> Connection { let origin = ProcessInfo.processInfo.environment["ELEVENLABS_CONVAI_SERVER_ORIGIN"] ?? Constants.defaultApiOrigin let pathname = ProcessInfo.processInfo.environment["ELEVENLABS_CONVAI_SERVER_PATHNAME"] ?? Constants.defaultApiPathname - + let urlString: String if let signedUrl = config.signedUrl { urlString = signedUrl @@ -148,147 +147,148 @@ public class ElevenLabsSwift { } else { throw ElevenLabsError.invalidConfiguration } - + guard let url = URL(string: urlString) else { throw ElevenLabsError.invalidURL } - + let session = URLSession(configuration: .default) let socket = session.webSocketTask(with: url) socket.resume() - + let configData = try await receiveInitialMessage(socket: socket) return Connection(socket: socket, conversationId: configData.conversationId, sampleRate: configData.sampleRate) } - + private static func receiveInitialMessage( socket: URLSessionWebSocketTask ) async throws -> (conversationId: String, sampleRate: Int) { return try await withCheckedThrowingContinuation { continuation in socket.receive { result in switch result { - case .success(let message): + case let .success(message): switch message { - case .string(let text): + case let .string(text): guard let data = text.data(using: .utf8), let json = try? JSONSerialization.jsonObject(with: data, options: []) as? [String: Any], let type = json["type"] as? String, type == "conversation_initiation_metadata", let metadata = json["conversation_initiation_metadata_event"] as? [String: Any], let conversationId = metadata["conversation_id"] as? String, - let audioFormat = metadata["agent_output_audio_format"] as? String else { + let audioFormat = metadata["agent_output_audio_format"] as? String + else { continuation.resume(throwing: ElevenLabsError.invalidInitialMessageFormat) return } - + let sampleRate = Int(audioFormat.replacingOccurrences(of: "pcm_", with: "")) ?? 16000 continuation.resume(returning: (conversationId: conversationId, sampleRate: sampleRate)) - + case .data: continuation.resume(throwing: ElevenLabsError.unexpectedBinaryMessage) - + @unknown default: continuation.resume(throwing: ElevenLabsError.unknownMessageType) } - case .failure(let error): + case let .failure(error): continuation.resume(throwing: error) } } } } - + public func close() { socket.cancel(with: .goingAway, reason: nil) } } - + // MARK: - Audio Input - + public class Input { public let engine: AVAudioEngine public let inputNode: AVAudioInputNode public let mixer: AVAudioMixerNode - + private init(engine: AVAudioEngine, inputNode: AVAudioInputNode, mixer: AVAudioMixerNode) { self.engine = engine self.inputNode = inputNode self.mixer = mixer } - - public static func create(sampleRate: Double) async throws -> Input { + + public static func create(sampleRate _: Double) async throws -> Input { let engine = AVAudioEngine() let inputNode = engine.inputNode let mixer = AVAudioMixerNode() - + engine.attach(mixer) engine.connect(inputNode, to: mixer, format: inputNode.inputFormat(forBus: 0)) - + try engine.start() return Input(engine: engine, inputNode: inputNode, mixer: mixer) } - + public func close() { engine.stop() } } - + // MARK: - Output - + public class Output { public let engine: AVAudioEngine public let playerNode: AVAudioPlayerNode public let mixer: AVAudioMixerNode - internal let audioQueue: DispatchQueue - + let audioQueue: DispatchQueue + private init(engine: AVAudioEngine, playerNode: AVAudioPlayerNode, mixer: AVAudioMixerNode) { self.engine = engine self.playerNode = playerNode self.mixer = mixer - self.audioQueue = DispatchQueue(label: "com.elevenlabs.audioQueue", qos: .userInteractive) + audioQueue = DispatchQueue(label: "com.elevenlabs.audioQueue", qos: .userInteractive) } - + public static func create(sampleRate: Double) async throws -> Output { let engine = AVAudioEngine() let playerNode = AVAudioPlayerNode() let mixer = AVAudioMixerNode() - + engine.attach(playerNode) engine.attach(mixer) - + guard let format = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: sampleRate, channels: 1, interleaved: false) else { throw ElevenLabsError.failedToCreateAudioFormat } - + engine.connect(playerNode, to: mixer, format: format) engine.connect(mixer, to: engine.mainMixerNode, format: format) - + return Output(engine: engine, playerNode: playerNode, mixer: mixer) } - + public func close() { engine.stop() } } - + // MARK: - Conversation - + public enum Role: String { case user case ai } - + public enum Mode: String { case speaking case listening } - + public enum Status: String { case connecting case connected case disconnecting case disconnected } - + public struct Callbacks: Sendable { public var onConnect: @Sendable (String) -> Void = { _ in } public var onDisconnect: @Sendable () -> Void = {} @@ -297,77 +297,77 @@ public class ElevenLabsSwift { public var onStatusChange: @Sendable (Status) -> Void = { _ in } public var onModeChange: @Sendable (Mode) -> Void = { _ in } public var onVolumeUpdate: @Sendable (Float) -> Void = { _ in } - + public init() {} } - + public class Conversation: @unchecked Sendable { private let connection: Connection private let input: Input private let output: Output private let callbacks: Callbacks - + private let modeLock = NSLock() private let statusLock = NSLock() private let volumeLock = NSLock() private let lastInterruptTimestampLock = NSLock() private let isProcessingInputLock = NSLock() - + private var inputVolumeUpdateTimer: Timer? private let inputVolumeUpdateInterval: TimeInterval = 0.1 // Update every 100ms private var currentInputVolume: Float = 0.0 - + private var _mode: Mode = .listening private var _status: Status = .connecting private var _volume: Float = 1.0 private var _lastInterruptTimestamp: Int = 0 private var _isProcessingInput: Bool = true - + private var mode: Mode { get { modeLock.withLock { _mode } } set { modeLock.withLock { _mode = newValue } } } - + private var status: Status { get { statusLock.withLock { _status } } set { statusLock.withLock { _status = newValue } } } - + private var volume: Float { get { volumeLock.withLock { _volume } } set { volumeLock.withLock { _volume = newValue } } } - + private var lastInterruptTimestamp: Int { get { lastInterruptTimestampLock.withLock { _lastInterruptTimestamp } } set { lastInterruptTimestampLock.withLock { _lastInterruptTimestamp = newValue } } } - + private var isProcessingInput: Bool { get { isProcessingInputLock.withLock { _isProcessingInput } } set { isProcessingInputLock.withLock { _isProcessingInput = newValue } } } - + private var audioBuffers: [AVAudioPCMBuffer] = [] private let audioBufferLock = NSLock() - + private var previousSamples: [Int16] = Array(repeating: 0, count: 10) private var isFirstBuffer = true - + private let audioConcatProcessor = ElevenLabsSwift.AudioConcatProcessor() private var outputBuffers: [[Float]] = [[]] - + private let logger = Logger(subsystem: "com.elevenlabs.ElevenLabsSwift", category: "Conversation") - + private func setupInputVolumeMonitoring() { - DispatchQueue.main.async { - self.inputVolumeUpdateTimer = Timer.scheduledTimer(withTimeInterval: self.inputVolumeUpdateInterval, repeats: true) { [weak self] _ in - guard let self = self else { return } - self.callbacks.onVolumeUpdate(self.currentInputVolume) - } - } - } - + DispatchQueue.main.async { + self.inputVolumeUpdateTimer = Timer.scheduledTimer(withTimeInterval: self.inputVolumeUpdateInterval, repeats: true) { [weak self] _ in + guard let self = self else { return } + self.callbacks.onVolumeUpdate(self.currentInputVolume) + } + } + } + private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) { guard let channelData = buffer.floatChannelData else { return @@ -375,11 +375,11 @@ public class ElevenLabsSwift { var sumOfSquares: Float = 0 let channelCount = Int(buffer.format.channelCount) - let frameLength = Int(buffer.frameLength) // Convert to Int + let frameLength = Int(buffer.frameLength) // Convert to Int - for channel in 0...size) - + // Encode and send only if we have data if !data.isEmpty { let base64 = data.base64EncodedString() let message: [String: Any] = ["type": "user_audio_chunk", - "user_audio_chunk": base64] + "user_audio_chunk": base64] self.sendWebSocketMessage(message) } - + // Update volume meter self.updateVolume(buffer) } - + output.engine.prepare() } private func updateVolume(_ buffer: AVAudioPCMBuffer) { - guard let channelData = buffer.floatChannelData else { return } + guard let channelData = buffer.floatChannelData else { return } - var sum: Float = 0 - let channelCount = Int(buffer.format.channelCount) + var sum: Float = 0 + let channelCount = Int(buffer.format.channelCount) - for channel in 0...size guard let audioBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: AVAudioFrameCount(frameCount)) else { callbacks.onError("Failed to create AVAudioPCMBuffer", nil) return } - + audioBuffer.frameLength = AVAudioFrameCount(frameCount) - + data.withUnsafeBytes { (int16Buffer: UnsafeRawBufferPointer) in let int16Pointer = int16Buffer.bindMemory(to: Int16.self).baseAddress! if let floatChannelData = audioBuffer.floatChannelData { - for i in 0.. String { connection.conversationId } - + /// Retrieves the input volume /// - Returns: Current input volume public func getInputVolume() -> Float { input.mixer.volume } - + /// Retrieves the output volume /// - Returns: Current output volume public func getOutputVolume() -> Float { output.mixer.volume } - + /// Starts recording audio input public func startRecording() { isProcessingInput = true } - + /// Stops recording audio input public func stopRecording() { isProcessingInput = false } - + private func clearAudioBuffers() { audioBufferLock.withLock { audioBuffers.removeAll() } audioConcatProcessor.handleMessage(["type": "clearInterrupted"]) } - + private func stopPlayback() { output.audioQueue.async { [weak self] in guard let self = self else { return } @@ -826,9 +826,9 @@ public class ElevenLabsSwift { } } } - + // MARK: - Errors - + /// Defines errors specific to ElevenLabsSwift public enum ElevenLabsError: Error, LocalizedError { case invalidConfiguration @@ -837,7 +837,7 @@ public class ElevenLabsSwift { case unexpectedBinaryMessage case unknownMessageType case failedToCreateAudioFormat - + public var errorDescription: String? { switch self { case .invalidConfiguration: @@ -855,31 +855,31 @@ public class ElevenLabsSwift { } } } - + // MARK: - Audio Session Configuration - + private static func configureAudioSession() throws { let audioSession = AVAudioSession.sharedInstance() do { // Configure for voice chat with minimum latency try audioSession.setCategory(.playAndRecord, mode: .voiceChat, - options: [ .allowBluetooth]) - + options: [.allowBluetooth]) + // Set preferred IO buffer duration for lower latency try audioSession.setPreferredIOBufferDuration(0.005) // 5ms buffer - + // Set preferred sample rate to match our target Note most IOS devices aren't able to go down this low try audioSession.setPreferredSampleRate(16000) - + // Request input gain control if available if audioSession.isInputGainSettable { try audioSession.setInputGain(1.0) } - + // Activate the session try audioSession.setActive(true, options: .notifyOthersOnDeactivation) - + } catch { print("Failed to configure audio session: \(error.localizedDescription)") throw error