
Commit d9f46e3

Fix many compiler warnings (#14)

* Fix compiler warnings
* Deprecate SuScaledRotaryEmbedding
* Fix defaultHubApi shared mutable state warning
* Fix non-final class warnings
* Fix ModelAdapterFactory warnings
* Resolve concurrency warnings in registries
* Make handler in Tool sendable
* Fix Message type ambiguity
* Replace `[String: Any]` with `[String: any Sendable]`
* Use real tests for streamlined API
* Refactor streamlined API
* Rename files with ChatSession for clarity
* Pin swift-transformers until next release
* Make additionalContext sendable
* Make tool schema sendable
* ChatSession: use AsyncStream-based generate instead of deprecated callback
* `sending` adjustments
* Docstring fix
* Mark non-thread-safe methods as deprecated, add thread-safe methods to ModelContainer
* Format
1 parent 74f85d9 commit d9f46e3
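
A recurring theme in these changes is replacing `[String: Any]` with `[String: any Sendable]` so dictionaries can cross concurrency boundaries without warnings under strict checking. A minimal sketch of the pattern (the `GenerationOptions` name below is hypothetical, not taken from this commit):

// `[String: Any]` cannot be sent across actors or Tasks under strict concurrency;
// constraining values to `any Sendable` lets the compiler prove the dictionary is safe.
struct GenerationOptions: Sendable {
    var additionalContext: [String: any Sendable] = [:]
}

let options = GenerationOptions(additionalContext: ["temperature": 0.7, "topK": 40])
Task {
    // Safe to capture: every stored value conforms to Sendable.
    print(options.additionalContext)
}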


50 files changed: +806 −836 lines

Libraries/Embedders/Load.swift
Lines changed: 0 additions & 7 deletions

@@ -91,13 +91,6 @@ func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel {
         }
     }

-    if let quantization = baseConfig.quantization {
-        quantize(model: model, groupSize: quantization.groupSize, bits: quantization.bits) {
-            path, module in
-            weights["\(path).scales"] != nil
-        }
-    }
-
     // apply the loaded weights
     let parameters = ModuleParameters.unflattened(weights)
     try model.update(parameters: parameters, verify: [.all])

Libraries/Embedders/Qwen3.swift
Lines changed: 3 additions & 3 deletions

@@ -60,7 +60,7 @@ private class Attention: Module {
     }

     public func callAsFunction(
-        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
     ) -> MLXArray {
         let (B, L) = (x.dim(0), x.dim(1))

@@ -125,7 +125,7 @@ private class TransformerBlock: Module {
     }

     public func callAsFunction(
-        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
     ) -> MLXArray {
         var r = attention(inputLayerNorm(x), mask: mask, cache: cache)
         let h = x + r
@@ -157,7 +157,7 @@ private class Qwen3ModelInner: Module {
     public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
         var h = embedTokens(inputs)

-        let mask: MLXArray? = createAttentionMask(h: h, cache: cache)
+        let mask = createAttentionMask(h: h, cache: cache?.first)

         for (i, layer) in layers.enumerated() {
             h = layer(h, mask: mask, cache: cache?[i])
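
In Qwen3 (and the other models touched below), attention layers now take an `MLXFast.ScaledDotProductAttentionMaskMode` instead of an optional `MLXArray`, and `createAttentionMask(h:cache:)` returns that mode directly from the hidden states and a single layer's cache. A hedged sketch of working with the returned mode, using only the calls that appear in this diff (the dummy activations and the default branch in the switch are illustration only):

import MLX
import MLXFast
import MLXLMCommon

// Dummy (batch, sequence, hidden) activations, just to have something to pass in.
let h = MLXArray.zeros([1, 8, 16])
let mask = createAttentionMask(h: h, cache: nil)

switch mask {
case .none:
    print("no mask: full attention")
case .array(let m):
    print("explicit mask with shape \(m.shape)")
default:
    print("another mode (e.g. causal) handled by the attention kernel")
}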

Libraries/MLXLLM/LLMModel.swift
Lines changed: 2 additions & 3 deletions

@@ -24,12 +24,11 @@ extension LLMModel {
     {
         let prefillStepSize = windowSize ?? 512
         var y = input.text
-        var state: LMOutput.State? = nil

-        // prepare the prompt in chunks if larger than the prefill size
+        // Prepare the prompt in chunks if larger than the prefill size
         while y.tokens.size > prefillStepSize {
             let input = y[.newAxis, ..<prefillStepSize]
-            let result = self(input, cache: cache.isEmpty ? nil : cache, state: state)
+            _ = self(input, cache: cache.isEmpty ? nil : cache, state: nil)
             eval(cache)
             y = y[prefillStepSize...]
         }
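
The prefill loop above now discards the per-chunk result (`_ =`) and drops the unused `state` variable: the prompt is fed to the model in windows of at most `prefillStepSize` tokens, each window updating the KV cache, and only the final partial window flows into normal decoding. A toy sketch of that chunking control flow in plain Swift (`process` stands in for the model call):

func prefill(_ tokens: [Int], stepSize: Int, process: ([Int]) -> Void) -> [Int] {
    var remaining = tokens[...]
    // Consume full windows; in the real code each call also updates the KV cache.
    while remaining.count > stepSize {
        process(Array(remaining.prefix(stepSize)))
        remaining = remaining.dropFirst(stepSize)
    }
    // Whatever is left (at most stepSize tokens) goes through the normal decode path.
    return Array(remaining)
}

let tail = prefill(Array(1...1200), stepSize: 512) { chunk in
    print("prefilled \(chunk.count) tokens")  // prints 512 twice
}
print("remaining \(tail.count) tokens")  // 176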

Libraries/MLXLLM/LLMModelFactory.swift
Lines changed: 48 additions & 53 deletions

@@ -20,58 +20,53 @@ private func create<C: Codable, M>(
 /// Registry of model type, e.g 'llama', to functions that can instantiate the model from configuration.
 ///
 /// Typically called via ``LLMModelFactory/load(hub:configuration:progressHandler:)``.
-public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {
+public enum LLMTypeRegistry {

     /// Shared instance with default model types.
-    public static let shared: LLMTypeRegistry = .init(creators: all())
-
-    /// All predefined model types.
-    private static func all() -> [String: @Sendable (URL) throws -> any LanguageModel] {
-        [
-            "mistral": create(LlamaConfiguration.self, LlamaModel.init),
-            "llama": create(LlamaConfiguration.self, LlamaModel.init),
-            "phi": create(PhiConfiguration.self, PhiModel.init),
-            "phi3": create(Phi3Configuration.self, Phi3Model.init),
-            "phimoe": create(PhiMoEConfiguration.self, PhiMoEModel.init),
-            "gemma": create(GemmaConfiguration.self, GemmaModel.init),
-            "gemma2": create(Gemma2Configuration.self, Gemma2Model.init),
-            "gemma3": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
-            "gemma3_text": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
-            "gemma3n": create(Gemma3nTextConfiguration.self, Gemma3nTextModel.init),
-            "qwen2": create(Qwen2Configuration.self, Qwen2Model.init),
-            "qwen3": create(Qwen3Configuration.self, Qwen3Model.init),
-            "qwen3_moe": create(Qwen3MoEConfiguration.self, Qwen3MoEModel.init),
-            "starcoder2": create(Starcoder2Configuration.self, Starcoder2Model.init),
-            "cohere": create(CohereConfiguration.self, CohereModel.init),
-            "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
-            "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
-            "deepseek_v3": create(DeepseekV3Configuration.self, DeepseekV3Model.init),
-            "granite": create(GraniteConfiguration.self, GraniteModel.init),
-            "granitemoehybrid": create(
-                GraniteMoeHybridConfiguration.self, GraniteMoeHybridModel.init),
-            "mimo": create(MiMoConfiguration.self, MiMoModel.init),
-            "glm4": create(GLM4Configuration.self, GLM4Model.init),
-            "acereason": create(Qwen2Configuration.self, Qwen2Model.init),
-            "falcon_h1": create(FalconH1Configuration.self, FalconH1Model.init),
-            "bitnet": create(BitnetConfiguration.self, BitnetModel.init),
-            "smollm3": create(SmolLM3Configuration.self, SmolLM3Model.init),
-            "ernie4_5": create(Ernie45Configuration.self, Ernie45Model.init),
-            "lfm2": create(LFM2Configuration.self, LFM2Model.init),
-            "baichuan_m1": create(BaichuanM1Configuration.self, BaichuanM1Model.init),
-            "exaone4": create(Exaone4Configuration.self, Exaone4Model.init),
-            "gpt_oss": create(GPTOSSConfiguration.self, GPTOSSModel.init),
-            "lille-130m": create(Lille130mConfiguration.self, Lille130mModel.init),
-            "olmoe": create(OlmoEConfiguration.self, OlmoEModel.init),
-            "olmo2": create(Olmo2Configuration.self, Olmo2Model.init),
-            "olmo3": create(Olmo3Configuration.self, Olmo3Model.init),
-            "bailing_moe": create(BailingMoeConfiguration.self, BailingMoeModel.init),
-            "lfm2_moe": create(LFM2MoEConfiguration.self, LFM2MoEModel.init),
-            "nanochat": create(NanoChatConfiguration.self, NanoChatModel.init),
-            "afmoe": create(AfMoEConfiguration.self, AfMoEModel.init),
-            "jamba_3b": create(JambaConfiguration.self, JambaModel.init),
-            "mistral3": create(Mistral3TextConfiguration.self, Mistral3TextModel.init),
-        ]
-    }
+    public static let shared: ModelTypeRegistry = .init(creators: [
+        "mistral": create(LlamaConfiguration.self, LlamaModel.init),
+        "llama": create(LlamaConfiguration.self, LlamaModel.init),
+        "phi": create(PhiConfiguration.self, PhiModel.init),
+        "phi3": create(Phi3Configuration.self, Phi3Model.init),
+        "phimoe": create(PhiMoEConfiguration.self, PhiMoEModel.init),
+        "gemma": create(GemmaConfiguration.self, GemmaModel.init),
+        "gemma2": create(Gemma2Configuration.self, Gemma2Model.init),
+        "gemma3": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
+        "gemma3_text": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
+        "gemma3n": create(Gemma3nTextConfiguration.self, Gemma3nTextModel.init),
+        "qwen2": create(Qwen2Configuration.self, Qwen2Model.init),
+        "qwen3": create(Qwen3Configuration.self, Qwen3Model.init),
+        "qwen3_moe": create(Qwen3MoEConfiguration.self, Qwen3MoEModel.init),
+        "starcoder2": create(Starcoder2Configuration.self, Starcoder2Model.init),
+        "cohere": create(CohereConfiguration.self, CohereModel.init),
+        "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
+        "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
+        "deepseek_v3": create(DeepseekV3Configuration.self, DeepseekV3Model.init),
+        "granite": create(GraniteConfiguration.self, GraniteModel.init),
+        "granitemoehybrid": create(
+            GraniteMoeHybridConfiguration.self, GraniteMoeHybridModel.init),
+        "mimo": create(MiMoConfiguration.self, MiMoModel.init),
+        "glm4": create(GLM4Configuration.self, GLM4Model.init),
+        "acereason": create(Qwen2Configuration.self, Qwen2Model.init),
+        "falcon_h1": create(FalconH1Configuration.self, FalconH1Model.init),
+        "bitnet": create(BitnetConfiguration.self, BitnetModel.init),
+        "smollm3": create(SmolLM3Configuration.self, SmolLM3Model.init),
+        "ernie4_5": create(Ernie45Configuration.self, Ernie45Model.init),
+        "lfm2": create(LFM2Configuration.self, LFM2Model.init),
+        "baichuan_m1": create(BaichuanM1Configuration.self, BaichuanM1Model.init),
+        "exaone4": create(Exaone4Configuration.self, Exaone4Model.init),
+        "gpt_oss": create(GPTOSSConfiguration.self, GPTOSSModel.init),
+        "lille-130m": create(Lille130mConfiguration.self, Lille130mModel.init),
+        "olmoe": create(OlmoEConfiguration.self, OlmoEModel.init),
+        "olmo2": create(Olmo2Configuration.self, Olmo2Model.init),
+        "olmo3": create(Olmo3Configuration.self, Olmo3Model.init),
+        "bailing_moe": create(BailingMoeConfiguration.self, BailingMoeModel.init),
+        "lfm2_moe": create(LFM2MoEConfiguration.self, LFM2MoEModel.init),
+        "nanochat": create(NanoChatConfiguration.self, NanoChatModel.init),
+        "afmoe": create(AfMoEConfiguration.self, AfMoEModel.init),
+        "jamba_3b": create(JambaConfiguration.self, JambaModel.init),
+        "mistral3": create(Mistral3TextConfiguration.self, Mistral3TextModel.init),
+    ])
 }

 /// Registry of models and any overrides that go with them, e.g. prompt augmentation.
@@ -458,7 +453,7 @@ private struct LLMUserInputProcessor: UserInputProcessor {
 /// let modelContainer = try await LLMModelFactory.shared.loadContainer(
 ///     configuration: LLMRegistry.llama3_8B_4bit)
 /// ```
-public class LLMModelFactory: ModelFactory {
+public final class LLMModelFactory: ModelFactory {

     public init(typeRegistry: ModelTypeRegistry, modelRegistry: AbstractModelRegistry) {
         self.typeRegistry = typeRegistry
@@ -478,7 +473,7 @@ public class LLMModelFactory: ModelFactory {
     public func _load(
         hub: HubApi, configuration: ModelConfiguration,
         progressHandler: @Sendable @escaping (Progress) -> Void
-    ) async throws -> sending ModelContext {
+    ) async throws -> ModelContext {
         // download weights and config
         let modelDirectory = try await downloadModel(
             hub: hub, configuration: configuration, progressHandler: progressHandler)
@@ -497,7 +492,7 @@ public class LLMModelFactory: ModelFactory {

         let model: LanguageModel
         do {
-            model = try typeRegistry.createModel(
+            model = try await typeRegistry.createModel(
                 configuration: configurationURL, modelType: baseConfig.modelType)
         } catch let error as DecodingError {
             throw ModelFactoryError.configurationDecodingError(
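
The registry change turns `LLMTypeRegistry` into a namespace whose `shared` value is a plain `ModelTypeRegistry`, `LLMModelFactory` becomes `final`, and `createModel` is now awaited. Typical call sites are unchanged; the usage from the factory's own doc comment (kept in the diff above) still applies. A hedged sketch, assuming `loadContainer` returns a `ModelContainer` as that comment implies:

import MLXLLM
import MLXLMCommon

func loadLlama() async throws -> ModelContainer {
    // Mirrors the doc comment on LLMModelFactory; registry lookup and the
    // async createModel call happen inside loadContainer.
    try await LLMModelFactory.shared.loadContainer(
        configuration: LLMRegistry.llama3_8B_4bit)
}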

Libraries/MLXLLM/Models/AfMoE.swift
Lines changed: 3 additions & 23 deletions

@@ -498,14 +498,13 @@ private class AfMoEModelInner: Module {
         }

         // Create attention masks
-        let faCache: [KVCache]? = layerCache.map { [$0[faIdx]] }
-        let faMask = createAttentionMask(h: h, cache: faCache)
+        let faMask = createAttentionMask(h: h, cache: layerCache?[faIdx])

         var swaMask: MLXFast.ScaledDotProductAttentionMaskMode = .none
         if let swaIdx = swaIdx, let layerCache = layerCache {
-            let swaCache = [layerCache[swaIdx]]
             // Create mask with sliding window
-            swaMask = createSlidingWindowMask(h: h, cache: swaCache, windowSize: slidingWindow)
+            swaMask = createAttentionMask(
+                h: h, cache: layerCache[swaIdx], windowSize: slidingWindow)
         }

         for (i, layer) in layers.enumerated() {
@@ -515,25 +514,6 @@ private class AfMoEModelInner: Module {

         return norm(h)
     }
-
-    // Helper to create sliding window mask
-    private func createSlidingWindowMask(
-        h: MLXArray, cache: [KVCache]?, windowSize: Int
-    ) -> MLXFast.ScaledDotProductAttentionMaskMode {
-        let t = h.dim(1)
-        if t > 1 {
-            var offset = 0
-            if let c = cache?.first {
-                offset = c.offset
-                if let maxSize = c.maxSize {
-                    offset = min(maxSize, offset)
-                }
-            }
-            let mask = createCausalMask(n: t, offset: offset, windowSize: windowSize)
-            return .array(mask)
-        }
-        return .none
-    }
 }

 // MARK: - AfMoE Model (Public)
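
The removed `createSlidingWindowMask` helper built the windowed causal mask by hand from the cache offset; the replacement folds that into `createAttentionMask(h:cache:windowSize:)`. For intuition only, a toy Bool-matrix sketch of what such a mask permits (this is not the MLX implementation, and the exact window boundary there may differ by one):

// Row i is query position offset + i; it may attend to keys in (q - windowSize, q].
func slidingWindowCausalMask(n: Int, offset: Int, windowSize: Int) -> [[Bool]] {
    (0..<n).map { i in
        let q = offset + i
        return (0..<(offset + n)).map { k in k <= q && k > q - windowSize }
    }
}

// 3 new queries after 2 cached tokens, window of 4:
for row in slidingWindowCausalMask(n: 3, offset: 2, windowSize: 4) {
    print(row.map { $0 ? "1" : "." }.joined())
}
// 111..
// 1111.
// .1111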

Libraries/MLXLLM/Models/BailingMoe.swift
Lines changed: 3 additions & 3 deletions

@@ -212,7 +212,7 @@ private class BailingMoeGate: Module, UnaryLayer {
     }

     func groupSelect(_ x: MLXArray) -> (inds: MLXArray, scores: MLXArray) {
-        let (bsz, seqLen, h) = (x.dim(0), x.dim(1), x.dim(2))
+        let (bsz, seqLen, _) = (x.dim(0), x.dim(1), x.dim(2))

         let logits = gate(x)
         var scores = sigmoid(logits.asType(.float32))
@@ -221,14 +221,14 @@ private class BailingMoeGate: Module, UnaryLayer {

         let topKGroup = top(groupScores, k: 2, axis: -1).sum(axis: -1, keepDims: true)
         var k = nGroup - topkGroup
-        var groupIdx = argPartition(topKGroup, kth: k - 1, axis: -2)[.ellipsis, ..<k, 0...]
+        let groupIdx = argPartition(topKGroup, kth: k - 1, axis: -2)[.ellipsis, ..<k, 0...]
         scores = putAlong(groupScores, groupIdx, values: MLXArray(0.0), axis: -2)
         scores = flattened(scores, start: -2, end: -1)

         k = topK
         let inds = argPartition(-scores, kth: k - 1, axis: -1)[.ellipsis, ..<k]
         scores = takeAlong(scores, inds, axis: -1)
-        if topK ?? 1 > 1, normTopkProb {
+        if topK > 1, normTopkProb {
             let denominator = scores.sum(axis: -1, keepDims: true) + 1e-20
             scores = scores / denominator
         }

Libraries/MLXLLM/Models/DeepseekV3.swift
Lines changed: 1 addition & 2 deletions

@@ -3,7 +3,6 @@
 import Foundation
 import MLX
 import MLXFast
-import MLXLLM
 import MLXLMCommon
 import MLXNN

@@ -354,7 +353,7 @@ private class MoEGate: Module {
     }

     func callAsFunction(_ x: MLXArray) -> (MLXArray, MLXArray) {
-        let (bsz, seqLen, h) = (x.dim(0), x.dim(1), x.dim(2))
+        let (bsz, seqLen, _) = (x.dim(0), x.dim(1), x.dim(2))

         let hiddenStates = x.matmul(weight.T)
         var scores = sigmoid(hiddenStates)

Libraries/MLXLLM/Models/Gemma3Text.swift
Lines changed: 0 additions & 1 deletion

@@ -10,7 +10,6 @@
 import Foundation
 import MLX
 import MLXFast
-import MLXLLM
 import MLXLMCommon
 import MLXNN


Libraries/MLXLLM/Models/Granite.swift
Lines changed: 1 addition & 2 deletions

@@ -121,7 +121,6 @@ private class TransformerBlock: Module {
     let residualMultiplier: Float

     public init(_ args: GraniteConfiguration) {
-        let attentionHeads = args.attentionHeads
         let hiddenSize = args.hiddenSize

         self._attention.wrappedValue = Attention(args)
@@ -271,7 +270,7 @@ public struct GraniteConfiguration: Codable, Sendable {
         self.maxPositionEmbeddings = try container.decode(Int.self, forKey: .maxPositionEmbeddings)
         self.kvHeads = try container.decode(Int.self, forKey: .kvHeads)
         self.attentionBias = try container.decode(Bool.self, forKey: .attentionBias)
-        self.mlpBias = try container.decode(Bool.self, forKey: .mlpBias) ?? false
+        self.mlpBias = try container.decode(Bool.self, forKey: .mlpBias)
         self.ropeTheta = try container.decodeIfPresent(Float.self, forKey: .ropeTheta) ?? 10000000.0
         self.ropeScaling = try container.decodeIfPresent(
             [String: StringOrNumber].self, forKey: .ropeScaling)
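
The Granite fix drops a pointless `?? false`: `decode(Bool.self, forKey:)` already returns a non-optional `Bool`, so the nil-coalescing operator only generated a warning. When a key may genuinely be absent, `decodeIfPresent` plus `??` is the right pattern, as the `ropeTheta` line just below it shows. A small standalone sketch of the two cases (the `ToyConfiguration` type is illustrative, not from this commit):

import Foundation

struct ToyConfiguration: Codable {
    var mlpBias: Bool
    var ropeTheta: Float

    enum CodingKeys: String, CodingKey {
        case mlpBias = "mlp_bias"
        case ropeTheta = "rope_theta"
    }

    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // Required key: decode returns a non-optional Bool, so `?? false` would be dead code.
        self.mlpBias = try container.decode(Bool.self, forKey: .mlpBias)
        // Optional key: decodeIfPresent returns Float?, so `??` supplies the default.
        self.ropeTheta = try container.decodeIfPresent(Float.self, forKey: .ropeTheta) ?? 10_000_000.0
    }
}

let json = #"{"mlp_bias": false}"#.data(using: .utf8)!
let config = try! JSONDecoder().decode(ToyConfiguration.self, from: json)
print(config.mlpBias, config.ropeTheta)  // false 1e+07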

Libraries/MLXLLM/Models/GraniteMoeHybrid.swift
Lines changed: 5 additions & 5 deletions

@@ -119,7 +119,7 @@ private class GraniteMoeHybridMamba2Mixer: Module {
             }
         }

-        var padded = concatenated([convState!, input], axis: 1)
+        let padded = concatenated([convState!, input], axis: 1)

         if let cache {
             let end = padded.dim(1)
@@ -136,12 +136,12 @@ private class GraniteMoeHybridMamba2Mixer: Module {
         mask: MLXArray?,
         cache: MambaCache?
     ) -> MLXArray {
-        var projected = inProj(hiddenStates)
+        let projected = inProj(hiddenStates)
         let splits = split(
             projected, indices: [intermediateSize, intermediateSize + convDim], axis: -1)
-        var gate = splits[0]
+        let gate = splits[0]
         var convInput = splits[1]
-        var dt = splits[2]
+        let dt = splits[2]

         if let mask {
             let expandedMask = expandedDimensions(mask, axis: -1)
@@ -551,7 +551,7 @@ public class GraniteMoeHybridModel: Module, LLMModel, KVCacheDimensionProvider {
         for layerIndex in 0 ..< configuration.hiddenLayers {
             let prefix = "model.layers.\(layerIndex).block_sparse_moe"
             guard
-                var inputWeight = sanitized.removeValue(forKey: "\(prefix).input_linear.weight")
+                let inputWeight = sanitized.removeValue(forKey: "\(prefix).input_linear.weight")
             else { continue }

             let expertHidden = inputWeight.dim(1)
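
Several of the diffs above (BailingMoe, DeepseekV3, GraniteMoeHybrid) silence the same two Swift warnings: "variable was never mutated; consider changing to 'let'" and "immutable value was never used". A minimal standalone illustration of both fixes:

func selectedCount(_ dims: (Int, Int, Int)) -> Int {
    // Unused tuple element: bind it to `_` instead of leaving a dead name around.
    let (batch, seqLen, _) = dims
    // Never-mutated binding: declare it with `let`, not `var`.
    let total = batch * seqLen
    return total
}

print(selectedCount((2, 8, 512)))  // 16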
