-
Notifications
You must be signed in to change notification settings - Fork 453
Fix library driver library resolution in Alpine containers #1530
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import ( | |
| "bufio" | ||
| "flag" | ||
| "fmt" | ||
| "io" | ||
| "os" | ||
| "os/exec" | ||
| "path/filepath" | ||
|
|
@@ -168,6 +169,7 @@ func (l *Ldconfig) UpdateLDCache() error { | |
| return fmt.Errorf("failed to write %s drop-in: %w", ldsoconfdFilenamePattern, err) | ||
| } | ||
|
|
||
| systemSearchPaths := l.getSystemSearchPaths() | ||
| // In most cases, the hook will be executing a host ldconfig that may be configured widely | ||
| // differently from what the container image expects. | ||
| // The common case is Debian-like (e.g. Debian, Ubuntu) vs non-Debian-like (e.g. RHEL, Fedora). | ||
|
|
@@ -176,10 +178,15 @@ func (l *Ldconfig) UpdateLDCache() error { | |
| // paths to a drop-in conf file that is likely to be last in lexicographic order. Entries in the | ||
| // top-level ld.so.conf file may be processed after this drop-in, but this hook does not modify | ||
| // the top-level file if it exists. | ||
| if err := createLdsoconfdFile(defaultLdsoconfdDir, ldsoconfdSystemDirsFilenamePattern, l.getSystemSearchPaths()...); err != nil { | ||
| if err := createLdsoconfdFile(defaultLdsoconfdDir, ldsoconfdSystemDirsFilenamePattern, systemSearchPaths...); err != nil { | ||
| return fmt.Errorf("failed to write %s drop-in: %w", ldsoconfdSystemDirsFilenamePattern, err) | ||
| } | ||
|
|
||
| // Also output the folders to the alpine .path file as required. | ||
| if err := createAlpinePathFileIfRequired(append(filteredDirectories, systemSearchPaths...)...); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| return SafeExec(ldconfigPath, args, nil) | ||
| } | ||
|
|
||
|
|
@@ -246,23 +253,30 @@ func createLdsoconfdFile(ldsoconfdDir, pattern string, dirs ...string) error { | |
| _ = configFile.Close() | ||
| }() | ||
|
|
||
| if err := outputListToFile(configFile, dirs...); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // The created file needs to be world readable for the cases where the container is run as a non-root user. | ||
| if err := configFile.Chmod(0644); err != nil { | ||
| return fmt.Errorf("failed to chmod config file: %w", err) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func outputListToFile(w io.Writer, dirs ...string) error { | ||
| added := make(map[string]bool) | ||
| for _, dir := range dirs { | ||
| if added[dir] { | ||
| continue | ||
| } | ||
| _, err := fmt.Fprintf(configFile, "%s\n", dir) | ||
| _, err := fmt.Fprintf(w, "%s\n", dir) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to update config file: %w", err) | ||
| } | ||
| added[dir] = true | ||
| } | ||
|
|
||
| // The created file needs to be world readable for the cases where the container is run as a non-root user. | ||
| if err := configFile.Chmod(0644); err != nil { | ||
| return fmt.Errorf("failed to chmod config file: %w", err) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
|
|
@@ -360,6 +374,38 @@ func processLdsoconfFile(ldsoconfFilename string) ([]string, []string, error) { | |
| return directories, includedFilenames, nil | ||
| } | ||
|
|
||
| func createAlpinePathFileIfRequired(dirs ...string) error { | ||
| if !isAlpine() { | ||
| return nil | ||
| } | ||
|
|
||
| var pathFileName string | ||
| switch runtime.GOARCH { | ||
| case "amd64": | ||
| pathFileName = "/etc/ld-musl-x86_64.path" | ||
| case "arm64": | ||
| pathFileName = "/etc/ld-musl-aarch64.path" | ||
| } | ||
|
|
||
| pathFile, err := os.OpenFile(pathFileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("could not open .path file: %w", err) | ||
| } | ||
| defer func() { | ||
| _ = pathFile.Close() | ||
| }() | ||
|
|
||
| return outputListToFile(pathFile, dirs...) | ||
| } | ||
|
|
||
| func isAlpine() bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potentially out-of-scope -- is there an easy way of detecting if musl is the standard C library used in a distro? If yes, would we be open to expanding this change to be distro-agnostic? Meaning we would create the A quick LLM query suggested the below: |
||
| info, err := os.Stat("/etc/alpine-release") | ||
| if err != nil { | ||
| return false | ||
| } | ||
| return !info.IsDir() | ||
| } | ||
|
|
||
| // isDebianLike returns true if a Debian-like distribution is detected. | ||
| // Debian-like distributions include Debian and Ubuntu, whereas non-Debian-like | ||
| // distributions include RHEL and Fedora. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,7 @@ import ( | |
| var _ = Describe("docker", Ordered, ContinueOnFailure, func() { | ||
| var hostDriverVersion string | ||
| var hostDriverMajor string | ||
| var hostOutput string | ||
|
|
||
| // Install the NVIDIA Container Toolkit | ||
| BeforeAll(func(ctx context.Context) { | ||
|
|
@@ -41,20 +42,17 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { | |
| hostDriverVersion = strings.TrimSpace(parts[1]) | ||
| Expect(hostDriverVersion).ToNot(BeEmpty()) | ||
| hostDriverMajor = strings.SplitN(hostDriverVersion, ".", 2)[0] | ||
|
|
||
| hostOutput, _, err = runner.Run("nvidia-smi -L") | ||
| Expect(err).ToNot(HaveOccurred()) | ||
| }) | ||
|
|
||
| // GPUs are accessible in a container: Running nvidia-smi -L inside the | ||
| // container shows the same output inside the container as outside the | ||
| // container. This means that the following commands must all produce | ||
| // the same output | ||
| When("running nvidia-smi -L", Ordered, func() { | ||
| var hostOutput string | ||
| var err error | ||
|
|
||
| BeforeAll(func(ctx context.Context) { | ||
| hostOutput, _, err = runner.Run("nvidia-smi -L") | ||
| Expect(err).ToNot(HaveOccurred()) | ||
|
|
||
| _, _, err := runner.Run("docker pull ubuntu") | ||
| Expect(err).ToNot(HaveOccurred()) | ||
| }) | ||
|
|
@@ -589,4 +587,30 @@ EOF`) | |
| Expect(output).To(Equal(expectedOutput)) | ||
| }) | ||
| }) | ||
|
|
||
| When("running an alpine container", Ordered, func() { | ||
| BeforeAll(func(ctx context.Context) { | ||
| _, _, err := runner.Run(`docker build -t alpinecompat \ | ||
| - <<EOF | ||
| FROM alpine | ||
| ENV NVIDIA_VISIBLE_DEVICES=all | ||
| RUN apk add --no-cache gcompat | ||
| EOF`) | ||
| Expect(err).ToNot(HaveOccurred()) | ||
| }) | ||
|
|
||
| It("nvidia-smi should succeed when using the nvidia-container-runtime", func(ctx context.Context) { | ||
| output, _, err := runner.Run(`docker run --rm --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES=all \ | ||
| alpinecompat nvidia-smi -L`) | ||
| Expect(err).ToNot(HaveOccurred()) | ||
| Expect(output).To(Equal(hostOutput)) | ||
| }) | ||
|
|
||
| It("nvidia-smi should succeed when using the nvidia-container-runtime-hook", Label("legacy"), func(ctx context.Context) { | ||
| output, _, err := runner.Run(`docker run --rm --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES=all \ | ||
| alpinecompat nvidia-smi -L`) | ||
| Expect(err).ToNot(HaveOccurred()) | ||
| Expect(output).To(Equal(hostOutput)) | ||
| }) | ||
|
Comment on lines
+609
to
+614
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question -- was this test case passing before the relevant changes in this PR? If so, what is libnvidia-container doing to handle containers that use musl instead of glibc? |
||
| }) | ||
| }) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's add a more descriptive error message here.