Skip to content

Commit ee7ecdc

Browse files
authored
Improve --add-package duplicate detection (#12239)
* optimize name matches * changelog * Apply suggestion from @emmyoop
1 parent d74b58a commit ee7ecdc

File tree

3 files changed

+166
-15
lines changed

3 files changed

+166
-15
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
kind: Fixes
2+
body: ':bug: :snowman: Improve `dbt deps --add-package` duplicate detection with better
3+
cross-source matching and word boundaries'
4+
time: 2025-11-28T16:31:44.344099-05:00
5+
custom:
6+
Author: emmyoop
7+
Issue: "12239"

core/dbt/task/deps.py

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,45 @@ def check_for_duplicate_packages(self, packages_yml):
123123
124124
This method is called only during `dbt deps --add-package` to check if the package
125125
being added already exists in packages.yml. It uses substring matching to identify
126-
duplicates, checking if the package name appears within package identifiers (such as
127-
within git URLs, hub package names, or local paths).
126+
duplicates, which means it will match across different package sources. For example,
127+
adding a hub package "dbt-labs/dbt_utils" will remove an existing git package
128+
"https://github.com/dbt-labs/dbt-utils.git", since the hub name contains "dbt_utils" and the git URL contains its hyphen variant "dbt-utils".
129+
130+
The matching is flexible to handle both underscore and hyphen variants of package names,
131+
as git repos often use hyphens (dbt-utils) while package names use underscores (dbt_utils).
132+
Word boundaries (start/end of the identifier, /, .) are enforced to prevent false matches like "dbt-core" matching
133+
"dbt-core-utils".
128134
129135
Args:
130136
packages_yml (dict): In-memory read of `packages.yml` contents
131137
132138
Returns:
133139
dict: Updated packages_yml contents with matching packages removed
134140
"""
141+
# Extract the package name for matching
142+
package_name = self.args.add_package["name"]
143+
144+
# Create variants for flexible matching (handle _ vs -)
145+
# Check multiple variants to handle naming inconsistencies between hub and git
146+
package_name_parts = [
147+
package_name, # Original: "dbt-labs/dbt_utils"
148+
package_name.replace("_", "-"), # Hyphens: "dbt-labs/dbt-utils"
149+
package_name.replace("-", "_"), # Underscores: "dbt_labs/dbt_utils"
150+
]
151+
# Extract just the package name without org (after last /)
152+
if "/" in package_name:
153+
short_name = package_name.split("/")[-1]
154+
package_name_parts.extend(
155+
[
156+
short_name, # "dbt_utils"
157+
short_name.replace("_", "-"), # "dbt-utils"
158+
short_name.replace("-", "_"), # "dbt_utils" (deduplicated)
159+
]
160+
)
161+
162+
# Remove duplicates from package_name_parts
163+
package_name_parts = list(set(package_name_parts))
164+
135165
# Iterate backwards to safely delete items without index shifting issues
136166
for i in range(len(packages_yml["packages"]) - 1, -1, -1):
137167
pkg_entry = packages_yml["packages"][i]
@@ -146,19 +176,40 @@ def check_for_duplicate_packages(self, packages_yml):
146176
or pkg_entry.get("private") # private package
147177
)
148178

149-
# Check if package name appears in the identifier using substring match
150-
if package_identifier and self.args.add_package["name"] in package_identifier:
151-
del packages_yml["packages"][i]
152-
# Filter out non-string values (like warn-unpinned boolean) before logging
153-
# Note: Check for bool first since bool is a subclass of int in Python
154-
loggable_package = {
155-
k: v
156-
for k, v in pkg_entry.items()
157-
if not isinstance(v, bool)
158-
and isinstance(v, (str, int, float))
159-
and k != "unrendered"
160-
}
161-
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
179+
# Check if any variant of the package name appears in the identifier
180+
# Use word boundaries to avoid false matches (e.g., "dbt-core" shouldn't match "dbt-core-utils")
181+
# Word boundaries are: start/end of string, /, or .
182+
# Note: - and _ are NOT boundaries since they're used within compound package names
183+
if package_identifier:
184+
is_duplicate = False
185+
for name_variant in package_name_parts:
186+
if name_variant in package_identifier:
187+
# Found a match; verify that its first occurrence (the only one checked) is not a substring of a larger word
188+
# Check characters before and after the match
189+
idx = package_identifier.find(name_variant)
190+
start_ok = idx == 0 or package_identifier[idx - 1] in "/."
191+
end_idx = idx + len(name_variant)
192+
end_ok = (
193+
end_idx == len(package_identifier)
194+
or package_identifier[end_idx] in "/."
195+
)
196+
197+
if start_ok and end_ok:
198+
is_duplicate = True
199+
break
200+
201+
if is_duplicate:
202+
del packages_yml["packages"][i]
203+
# Filter out non-string values (like warn-unpinned boolean) before logging
204+
# Note: Check for bool first since bool is a subclass of int in Python
205+
loggable_package = {
206+
k: v
207+
for k, v in pkg_entry.items()
208+
if not isinstance(v, bool)
209+
and isinstance(v, (str, int, float))
210+
and k != "unrendered"
211+
}
212+
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
162213

163214
return packages_yml
164215

tests/unit/deps/test_deps.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,3 +1133,96 @@ def test_check_duplicate_multiple_matches(self):
11331133
self.assertIsNotNone(result)
11341134
self.assertEqual(len(result["packages"]), 1)
11351135
self.assertIn("dbt_amplitude", result["packages"][0]["git"])
1136+
1137+
def test_check_duplicate_underscore_hyphen_matching(self):
1138+
"""Test that underscore and hyphen variants match (dbt_utils matches dbt-utils)"""
1139+
# Adding hub package with underscore should match git package with hyphen
1140+
mock_args = Namespace(
1141+
add_package={"name": "dbt-labs/dbt_utils", "version": "1.0.0"}, source="hub"
1142+
)
1143+
1144+
with mock.patch("dbt.task.deps.BaseTask.__init__"):
1145+
task = DepsTask.__new__(DepsTask)
1146+
task.args = mock_args
1147+
1148+
packages_yml = {
1149+
"packages": [
1150+
{
1151+
"git": "https://github.com/dbt-labs/dbt-utils.git", # hyphen in URL
1152+
"revision": "1.0.0",
1153+
},
1154+
]
1155+
}
1156+
1157+
# Should match because "dbt-utils" variant matches the git URL
1158+
with mock.patch("dbt_common.events.functions.fire_event"):
1159+
result = task.check_for_duplicate_packages(packages_yml)
1160+
1161+
self.assertIsNotNone(result)
1162+
self.assertEqual(len(result["packages"]), 0) # Git package removed
1163+
1164+
def test_check_duplicate_no_partial_word_match(self):
1165+
"""Test that partial word matches are rejected (dbt-core shouldn't match dbt-core-utils)"""
1166+
mock_args = Namespace(
1167+
add_package={"name": "dbt-labs/dbt-core", "version": "1.0.0"}, source="hub"
1168+
)
1169+
1170+
with mock.patch("dbt.task.deps.BaseTask.__init__"):
1171+
task = DepsTask.__new__(DepsTask)
1172+
task.args = mock_args
1173+
1174+
packages_yml = {
1175+
"packages": [
1176+
{
1177+
"git": "https://github.com/dbt-labs/dbt-core-utils.git",
1178+
"revision": "1.0.0",
1179+
},
1180+
{
1181+
"package": "other-org/my-dbt-core-fork",
1182+
"version": "2.0.0",
1183+
},
1184+
]
1185+
}
1186+
1187+
# Should NOT match because "dbt-core" is part of a larger word
1188+
with mock.patch("dbt_common.events.functions.fire_event"):
1189+
result = task.check_for_duplicate_packages(packages_yml)
1190+
1191+
# Both packages should remain (no matches)
1192+
self.assertIsNotNone(result)
1193+
self.assertEqual(len(result["packages"]), 2)
1194+
1195+
def test_check_duplicate_exact_word_boundary_match(self):
1196+
"""Test that exact matches with word boundaries work correctly"""
1197+
mock_args = Namespace(
1198+
add_package={"name": "dbt-labs/dbt-utils", "version": "1.0.0"}, source="hub"
1199+
)
1200+
1201+
with mock.patch("dbt.task.deps.BaseTask.__init__"):
1202+
task = DepsTask.__new__(DepsTask)
1203+
task.args = mock_args
1204+
1205+
packages_yml = {
1206+
"packages": [
1207+
{
1208+
"git": "https://github.com/dbt-labs/dbt-utils.git", # Should match
1209+
"revision": "1.0.0",
1210+
},
1211+
{
1212+
"git": "https://github.com/other/dbt-utils-extra.git", # Should NOT match
1213+
"revision": "2.0.0",
1214+
},
1215+
{
1216+
"package": "dbt-labs/dbt_utils", # Should match (underscore variant)
1217+
"version": "0.9.0",
1218+
},
1219+
]
1220+
}
1221+
1222+
with mock.patch("dbt_common.events.functions.fire_event"):
1223+
result = task.check_for_duplicate_packages(packages_yml)
1224+
1225+
# Only dbt-utils-extra should remain
1226+
self.assertIsNotNone(result)
1227+
self.assertEqual(len(result["packages"]), 1)
1228+
self.assertIn("dbt-utils-extra", result["packages"][0]["git"])

0 commit comments

Comments
 (0)