
Commit 10bc02d

support for LLMBasic (mlx-swift-examples)
- ml-explore/mlx-swift-examples#454
- fixes #27
- move ChatSession integration tests into new test target so we can more easily control when it runs
- make a ChatSession _unit_ (more or less) test
- fix Sendable / thread safety issues uncovered by LLMBasic
1 parent d9f46e3 commit 10bc02d
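
The Sendable / thread-safety fixes mentioned above are not among the hunks shown below. As a rough, generic illustration of the kind of change that phrase usually implies (all names here are hypothetical, not types from this commit), shared mutable state can be put behind a lock so the type can safely be marked Sendable:

import Foundation

// Hypothetical example only: a class with mutable state that is touched from
// multiple threads is not Sendable as-is. Guarding the state with a lock and
// declaring @unchecked Sendable is one common fix; converting to an actor is another.
final class GenerationState: @unchecked Sendable {
    private let lock = NSLock()
    private var tokens: [Int] = []

    func append(_ token: Int) {
        lock.lock()
        defer { lock.unlock() }
        tokens.append(token)
    }

    func snapshot() -> [Int] {
        lock.lock()
        defer { lock.unlock() }
        return tokens
    }
}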

15 files changed: +451 −211 lines changed


Libraries/MLXLLM/Models/Gemma3Text.swift

Lines changed: 66 additions & 48 deletions
@@ -23,12 +23,40 @@ public struct Gemma3TextConfiguration: Codable {
     let rmsNormEps: Float
     let vocabularySize: Int
     let kvHeads: Int
-    let ropeGlobalBaseFreq: Float
+    let ropeTheta: Float
     let ropeLocalBaseFreq: Float
     let ropeTraditional: Bool
     let queryPreAttnScalar: Float
     let slidingWindow: Int
     let slidingWindowPattern: Int
+    let maxPositionEmbeddings: Int
+    let ropeScaling: [String: StringOrNumber]?
+
+    public init(
+        modelType: String, hiddenSize: Int, hiddenLayers: Int, intermediateSize: Int,
+        attentionHeads: Int, headDim: Int, rmsNormEps: Float, vocabularySize: Int, kvHeads: Int,
+        ropeTheta: Float, ropeLocalBaseFreq: Float, ropeTraditional: Bool,
+        queryPreAttnScalar: Float, slidingWindow: Int, slidingWindowPattern: Int,
+        maxPositionEmbeddings: Int, ropeScaling: [String: StringOrNumber]? = nil
+    ) {
+        self.modelType = modelType
+        self.hiddenSize = hiddenSize
+        self.hiddenLayers = hiddenLayers
+        self.intermediateSize = intermediateSize
+        self.attentionHeads = attentionHeads
+        self.headDim = headDim
+        self.rmsNormEps = rmsNormEps
+        self.vocabularySize = vocabularySize
+        self.kvHeads = kvHeads
+        self.ropeTheta = ropeTheta
+        self.ropeLocalBaseFreq = ropeLocalBaseFreq
+        self.ropeTraditional = ropeTraditional
+        self.queryPreAttnScalar = queryPreAttnScalar
+        self.slidingWindow = slidingWindow
+        self.slidingWindowPattern = slidingWindowPattern
+        self.maxPositionEmbeddings = maxPositionEmbeddings
+        self.ropeScaling = ropeScaling
+    }

     enum CodingKeys: String, CodingKey {
         case modelType = "model_type"
@@ -40,12 +68,14 @@ public struct Gemma3TextConfiguration: Codable {
         case rmsNormEps = "rms_norm_eps"
         case vocabularySize = "vocab_size"
         case kvHeads = "num_key_value_heads"
-        case ropeGlobalBaseFreq = "rope_global_base_freq"
+        case ropeTheta = "rope_theta"
         case ropeLocalBaseFreq = "rope_local_base_freq"
         case ropeTraditional = "rope_traditional"
         case queryPreAttnScalar = "query_pre_attn_scalar"
         case slidingWindow = "sliding_window"
         case slidingWindowPattern = "sliding_window_pattern"
+        case maxPositionEmbeddings = "max_position_embeddings"
+        case ropeScaling = "rope_scaling"
     }

     enum VLMCodingKeys: String, CodingKey {
@@ -65,16 +95,17 @@ public struct Gemma3TextConfiguration: Codable {
         }

         modelType = try container.decode(String.self, forKey: .modelType)
-        hiddenSize = try container.decode(Int.self, forKey: .hiddenSize)
-        hiddenLayers = try container.decode(Int.self, forKey: .hiddenLayers)
-        intermediateSize = try container.decode(Int.self, forKey: .intermediateSize)
+        hiddenSize = try container.decodeIfPresent(Int.self, forKey: .hiddenSize) ?? 1152
+        hiddenLayers = try container.decodeIfPresent(Int.self, forKey: .hiddenLayers) ?? 26
+        intermediateSize =
+            try container.decodeIfPresent(Int.self, forKey: .intermediateSize) ?? 6912
         attentionHeads = try container.decodeIfPresent(Int.self, forKey: .attentionHeads) ?? 4
         headDim = try container.decodeIfPresent(Int.self, forKey: .headDim) ?? 256
         rmsNormEps = try container.decodeIfPresent(Float.self, forKey: .rmsNormEps) ?? 1.0e-6
         vocabularySize = try container.decodeIfPresent(Int.self, forKey: .vocabularySize) ?? 262144
         kvHeads = try container.decodeIfPresent(Int.self, forKey: .kvHeads) ?? 1
-        ropeGlobalBaseFreq =
-            try container.decodeIfPresent(Float.self, forKey: .ropeGlobalBaseFreq) ?? 1_000_000.0
+        ropeTheta =
+            try container.decodeIfPresent(Float.self, forKey: .ropeTheta) ?? 1_000_000.0
         ropeLocalBaseFreq =
             try container.decodeIfPresent(Float.self, forKey: .ropeLocalBaseFreq) ?? 10_000.0
         ropeTraditional =
@@ -84,6 +115,10 @@ public struct Gemma3TextConfiguration: Codable {
         slidingWindow = try container.decodeIfPresent(Int.self, forKey: .slidingWindow) ?? 512
         slidingWindowPattern =
             try container.decodeIfPresent(Int.self, forKey: .slidingWindowPattern) ?? 6
+        maxPositionEmbeddings =
+            try container.decodeIfPresent(Int.self, forKey: .maxPositionEmbeddings) ?? 32768
+        ropeScaling =
+            try container.decodeIfPresent([String: StringOrNumber].self, forKey: .ropeScaling)
     }
 }

@@ -106,7 +141,7 @@ private class Attention: Module {
     @ModuleInfo(key: "q_norm") var queryNorm: Gemma.RMSNorm
     @ModuleInfo(key: "k_norm") var keyNorm: Gemma.RMSNorm

-    @ModuleInfo var rope: RoPE
+    @ModuleInfo var rope: OffsetLayer

     init(_ config: Gemma3TextConfiguration, layerIdx: Int) {
         let dim = config.hiddenSize
@@ -131,12 +166,16 @@ private class Attention: Module {

         self.isSliding = (layerIdx + 1) % config.slidingWindowPattern != 0

-        let baseFreq = isSliding ? config.ropeLocalBaseFreq : config.ropeGlobalBaseFreq
-        self._rope.wrappedValue = RoPE(
-            dimensions: headDim,
-            traditional: config.ropeTraditional,
-            base: baseFreq
-        )
+        if isSliding {
+            self.rope = initializeRope(
+                dims: headDim, base: config.ropeLocalBaseFreq, traditional: false,
+                scalingConfig: nil, maxPositionEmbeddings: nil)
+        } else {
+            self.rope = initializeRope(
+                dims: headDim, base: config.ropeTheta, traditional: false,
+                scalingConfig: config.ropeScaling,
+                maxPositionEmbeddings: config.maxPositionEmbeddings)
+        }

         super.init()
     }
@@ -163,18 +202,8 @@ private class Attention: Module {
             queries = rope(queries, offset: cache.offset)
             keys = rope(keys, offset: cache.offset)
         } else {
-            queries = rope(queries)
-            keys = rope(keys)
-        }
-
-        // Sliding window masking
-        var finalMask = mask
-        if case .array(let maskArray) = mask {
-            let keySeqLen = keys.shape[2]
-            if maskArray.shape.last! != keySeqLen {
-                let slicedMask = maskArray[.ellipsis, (-keySeqLen)...]
-                finalMask = .array(slicedMask)
-            }
+            queries = rope(queries, offset: 0)
+            keys = rope(keys, offset: 0)
         }

         let output = attentionWithCacheUpdate(
@@ -183,7 +212,7 @@ private class Attention: Module {
             values: values,
             cache: cache,
             scale: scale,
-            mask: finalMask
+            mask: mask
         )
         .transposed(0, 2, 1, 3)
         .reshaped(B, L, -1)
@@ -295,30 +324,19 @@ private class Gemma3Model: Module {
         if layerCache == nil {
            layerCache = Array(repeating: nil as KVCache?, count: layers.count)
         }
-        // Create attention masks
-        var fullMask: MLXFast.ScaledDotProductAttentionMaskMode = .none
-        var slidingWindowMask: MLXFast.ScaledDotProductAttentionMaskMode = .none
-        if mask == nil {
-            let j = config.slidingWindowPattern
-            let globalCache: KVCache? =
-                (j > 0 && j <= (layerCache?.count ?? 0)) ? layerCache?[j - 1] : nil
-            fullMask = createAttentionMask(h: h, cache: globalCache)
-            let slidingCache: KVCache? = layerCache?.first ?? nil
-            slidingWindowMask = createAttentionMask(
-                h: h, cache: slidingCache, windowSize: config.slidingWindow)
-        }
-        for (i, layer) in layers.enumerated() {
-            let isGlobal = (i % config.slidingWindowPattern == config.slidingWindowPattern - 1)

-            let localMask: MLXFast.ScaledDotProductAttentionMaskMode
-            if let mask {
-                localMask = mask
-            } else if isGlobal {
-                localMask = fullMask
+        let globalMask = createAttentionMask(h: h, cache: cache?[config.slidingWindowPattern - 1])
+        let slidingWindowMask =
+            if config.slidingWindowPattern > 1 {
+                createAttentionMask(h: h, cache: cache?[0], windowSize: config.slidingWindow)
            } else {
-                localMask = slidingWindowMask
+                MLXFast.ScaledDotProductAttentionMaskMode.none
            }
-            h = layer(h, mask: localMask, cache: layerCache?[i])
+
+        for (i, layer) in layers.enumerated() {
+            let isGlobal = (i % config.slidingWindowPattern == config.slidingWindowPattern - 1)
+            let mask = isGlobal ? globalMask : slidingWindowMask
+            h = layer(h, mask: mask, cache: layerCache?[i])
         }
         return norm(h)
     }

Libraries/MLXLLM/Models/Mistral3Text.swift

Lines changed: 3 additions & 16 deletions
@@ -42,7 +42,7 @@ private class Attention: Module {
     @ModuleInfo(key: "v_proj") var wv: Linear
     @ModuleInfo(key: "o_proj") var wo: Linear

-    let rope: Module
+    let rope: OffsetLayer

     init(_ args: Mistral3TextConfiguration) {
         self.args = args
@@ -76,19 +76,6 @@ private class Attention: Module {
         super.init()
     }

-    private func applyRoPE(_ x: MLXArray, offset: Int) -> MLXArray {
-        if let ropeModule = rope as? RoPE {
-            return ropeModule(x, offset: offset)
-        } else if let llama3Rope = rope as? Llama3RoPE {
-            return llama3Rope(x, offset: offset)
-        } else if let yarnRope = rope as? YarnRoPE {
-            return yarnRope(x, offset: offset)
-        } else if let suScaledRope = rope as? SuScaledRoPE {
-            return suScaledRope(x, offset: offset)
-        }
-        return x
-    }
-
     func callAsFunction(
         _ x: MLXArray, attnScale: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode,
         cache: KVCache?
@@ -106,8 +93,8 @@ private class Attention: Module {

         // Apply RoPE
         let offset = cache?.offset ?? 0
-        queries = applyRoPE(queries, offset: offset)
-        keys = applyRoPE(keys, offset: offset)
+        queries = rope(queries, offset: offset)
+        keys = rope(keys, offset: offset)

         // Apply attention scaling
         queries = queries * attnScale
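
Here (and in Olmo3.swift below) the per-model applyRoPE helper, which dynamically cast a Module-typed rope to RoPE / Llama3RoPE / YarnRoPE / SuScaledRoPE, is replaced by a protocol-typed property. A minimal sketch of how such an abstraction can work, using a hypothetical PositionalLayer protocol as a stand-in (the real OffsetLayer and initializeRope live in MLXLMCommon and may be declared differently):

import MLX
import MLXNN

// Hypothetical stand-in for MLXLMCommon's OffsetLayer.
protocol PositionalLayer {
    func callAsFunction(_ x: MLXArray, offset: Int) -> MLXArray
}

// MLXNN's RoPE already exposes callAsFunction(_:offset:), so conformance is
// just a declaration; scaled variants could conform the same way.
extension RoPE: PositionalLayer {}

final class AttentionSketch {
    // Protocol-typed storage: call sites no longer need `rope as? RoPE`
    // casting chains, and a new RoPE variant only needs to adopt the protocol.
    let rope: any PositionalLayer

    init(headDim: Int, base: Float) {
        self.rope = RoPE(dimensions: headDim, traditional: false, base: base)
    }

    func rotate(_ queries: MLXArray, _ keys: MLXArray, offset: Int) -> (MLXArray, MLXArray) {
        (rope(queries, offset: offset), rope(keys, offset: offset))
    }
}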

Libraries/MLXLLM/Models/Olmo3.swift

Lines changed: 5 additions & 16 deletions
@@ -29,7 +29,7 @@ private class Attention: Module {
     @ModuleInfo(key: "q_norm") var qNorm: RMSNorm
     @ModuleInfo(key: "k_norm") var kNorm: RMSNorm

-    let rope: Module
+    let rope: OffsetLayer

     init(_ args: Olmo3Configuration, layerIdx: Int) {
         self.args = args
@@ -65,17 +65,6 @@ private class Attention: Module {
         super.init()
     }

-    private func applyRoPE(_ x: MLXArray, offset: Int?) -> MLXArray {
-        if let llama3Rope = rope as? Llama3RoPE {
-            return llama3Rope(x, offset: offset ?? 0)
-        } else if let yarnRope = rope as? YarnRoPE {
-            return yarnRope(x, offset: offset ?? 0)
-        } else if let basicRope = rope as? RoPE {
-            return basicRope(x, offset: offset ?? 0)
-        }
-        return x
-    }
-
     func callAsFunction(
         _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
     ) -> MLXArray {
@@ -90,11 +79,11 @@ private class Attention: Module {
         values = values.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)

         if let cache {
-            queries = applyRoPE(queries, offset: cache.offset)
-            keys = applyRoPE(keys, offset: cache.offset)
+            queries = rope(queries, offset: cache.offset)
+            keys = rope(keys, offset: cache.offset)
         } else {
-            queries = applyRoPE(queries, offset: nil)
-            keys = applyRoPE(keys, offset: nil)
+            queries = rope(queries, offset: 0)
+            keys = rope(keys, offset: 0)
         }

         let output = attentionWithCacheUpdate(

Libraries/MLXLMCommon/AttentionUtils.swift

Lines changed: 2 additions & 0 deletions
@@ -67,6 +67,8 @@ public func attentionWithCacheUpdate(
         )
     } else {
         let (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
+        // TODO dkoski
+        // print("\(cachedKeys.shape) \(cachedValues.shape) \(queries.shape), \(mask.masks?[0].shape ?? [])")
         return MLXFast.scaledDotProductAttention(
             queries: queries,
             keys: cachedKeys,
