Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ All notable changes to this project will be documented in this file.
See [objectOverrides concepts page](https://docs.stackable.tech/home/nightly/concepts/overrides/#object-overrides) for details ([#741]).
- Enable the [restart-controller](https://docs.stackable.tech/home/nightly/commons-operator/restarter/), so that the Pods are automatically restarted on config changes ([#743]).

### Fixed

- Previously, some shell output of init-containers was not logged properly and therefore not aggregated, which is fixed now ([#746]).

[#741]: https://github.com/stackabletech/hdfs-operator/pull/741
[#743]: https://github.com/stackabletech/hdfs-operator/pull/743
[#746]: https://github.com/stackabletech/hdfs-operator/pull/746

## [25.11.0] - 2025-11-07

Expand Down
130 changes: 90 additions & 40 deletions rust/operator-binary/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ use stackable_operator::{
LoggingError, create_vector_shutdown_file_command, remove_vector_shutdown_file_command,
},
spec::{
ConfigMapLogConfig, ContainerLogConfig, ContainerLogConfigChoice,
CustomContainerLogConfig,
AutomaticContainerLogConfig, ConfigMapLogConfig, ContainerLogConfig,
ContainerLogConfigChoice, CustomContainerLogConfig,
},
},
role_utils::RoleGroupRef,
Expand Down Expand Up @@ -627,22 +627,22 @@ impl ContainerConfig {
&merged_config.hdfs_logging(),
));

args.push_str(&format!(
args.push_str(&formatdoc!(
r#"\
{COMMON_BASH_TRAP_FUNCTIONS}
{remove_vector_shutdown_file_command}
prepare_signal_handlers
containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop &
if [[ -d {LISTENER_VOLUME_DIR} ]]; then
export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address)
for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do
export $(basename $i | tr a-z- A-Z_)_PORT="$(cat $i)"
done
fi
{hadoop_home}/bin/hdfs {role} {upgrade_args} &
wait_for_termination $!
{create_vector_shutdown_file_command}
"#,
{COMMON_BASH_TRAP_FUNCTIONS}
{remove_vector_shutdown_file_command}
prepare_signal_handlers
containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop &
if [[ -d {LISTENER_VOLUME_DIR} ]]; then
export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address)
for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do
export $(basename $i | tr a-z- A-Z_)_PORT="$(cat $i)"
done
fi
{hadoop_home}/bin/hdfs {role} {upgrade_args} &
wait_for_termination $!
{create_vector_shutdown_file_command}
"#,
hadoop_home = Self::HADOOP_HOME,
remove_vector_shutdown_file_command =
remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
Expand All @@ -664,7 +664,9 @@ wait_for_termination $!
hadoop_home = Self::HADOOP_HOME
));
}
ContainerConfig::FormatNameNodes { .. } => {
ContainerConfig::FormatNameNodes { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_namenode().map(|node| {
node.logging
.for_container(&NameNodeContainer::FormatNameNodes)
Expand All @@ -690,32 +692,36 @@ wait_for_termination $!
for namenode_id in {pod_names}
do
echo -n "Checking pod $namenode_id... "
{get_service_state_command}

# We only redirect 2 (stderr) to 4 (console).
# We leave 1 (stdout) alone so the $(...) can catch it.
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true)

if [ "$SERVICE_STATE" == "active" ]
then
ACTIVE_NAMENODE=$namenode_id
ACTIVE_NAMENODE="$namenode_id"
echo "active"
break
else
echo "unknown / unreachable"
fi
echo ""
done

if [ ! -f "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" ]
then
if [ -z ${{ACTIVE_NAMENODE+x}} ]
then
echo "Create pod $POD_NAME as active namenode."
{hadoop_home}/bin/hdfs namenode -format -noninteractive
echo "No active namenode found. Formatting $POD_NAME as active."
exclude_from_capture {hadoop_home}/bin/hdfs namenode -format -noninteractive
else
echo "Create pod $POD_NAME as standby namenode."
{hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive
echo "Active namenode is $ACTIVE_NAMENODE. Bootstrapping standby."
exclude_from_capture {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive
fi
else
cat "{NAMENODE_ROOT_DATA_DIR}/current/VERSION"
echo "Pod $POD_NAME already formatted. Skipping..."
fi
"###,
get_service_state_command = Self::get_namenode_service_state_command(),
hadoop_home = Self::HADOOP_HOME,
pod_names = namenode_podrefs
.iter()
Expand All @@ -724,7 +730,9 @@ wait_for_termination $!
.join(" "),
));
}
ContainerConfig::FormatZooKeeper { .. } => {
ContainerConfig::FormatZooKeeper { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_namenode().map(|node| {
node.logging
.for_container(&NameNodeContainer::FormatZooKeeper)
Expand All @@ -739,7 +747,7 @@ wait_for_termination $!
echo "Attempt to format ZooKeeper..."
if [[ "0" -eq "$(echo $POD_NAME | sed -e 's/.*-//')" ]] ; then
set +e
{hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive
exclude_from_capture {hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive
EXITCODE=$?
set -e
if [[ $EXITCODE -eq 0 ]]; then
Expand All @@ -755,10 +763,12 @@ wait_for_termination $!
echo "ZooKeeper already formatted!"
fi
"###,
hadoop_home = Self::HADOOP_HOME
hadoop_home = Self::HADOOP_HOME,
));
}
ContainerConfig::WaitForNameNodes { .. } => {
ContainerConfig::WaitForNameNodes { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_datanode().map(|node| {
node.logging
.for_container(&DataNodeContainer::WaitForNameNodes)
Expand All @@ -781,7 +791,11 @@ wait_for_termination $!
for namenode_id in {pod_names}
do
echo -n "Checking pod $namenode_id... "
{get_service_state_command}

# We only redirect 2 (stderr) to 4 (console).
# We leave 1 (stdout) alone so the $(...) can catch it.
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true)

if [ "$SERVICE_STATE" = "active" ] || [ "$SERVICE_STATE" = "standby" ]
then
echo "$SERVICE_STATE"
Expand All @@ -800,7 +814,7 @@ wait_for_termination $!
sleep 5
done
"###,
get_service_state_command = Self::get_namenode_service_state_command(),
hadoop_home = Self::HADOOP_HOME,
pod_names = namenode_podrefs
.iter()
.map(|pod_ref| pod_ref.pod_name.as_ref())
Expand Down Expand Up @@ -842,14 +856,6 @@ wait_for_termination $!
))
}

/// Builds a bash snippet that queries the HA service state of the namenode
/// referenced by the shell variable `$namenode_id` and stores it in
/// `SERVICE_STATE`.
///
/// `tail -n1` trims the `haadmin` output to its last line, and `|| true`
/// keeps the surrounding `set -e` script alive when the namenode is
/// unreachable.
fn get_namenode_service_state_command() -> String {
formatdoc!(
r###"
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState $namenode_id | tail -n1 || true)"###,
hadoop_home = Self::HADOOP_HOME,
)
}

/// Returns the container env variables.
fn env(
&self,
Expand Down Expand Up @@ -1565,3 +1571,47 @@ impl TryFrom<&str> for ContainerVolumeDirs {
})
}
}

/// Returns a bash preamble for init-containers that captures the shell's
/// stdout/stderr into log files so the output can be aggregated ([#746]).
///
/// The generated script:
/// * duplicates the original stdout/stderr onto file descriptors 3 and 4 so
///   they can always be restored or written to directly,
/// * defines `start_capture`, which applies the redirections produced by
///   `product_logging::framework::capture_shell_output` (assumed to redirect
///   FDs 1 and 2 into per-container log files under `STACKABLE_LOG_DIR` —
///   confirm against the framework implementation),
/// * defines `stop_capture`, which restores the saved descriptors
///   (NOTE(review): not invoked by the visible generated scripts; presumably
///   kept for symmetry/future use),
/// * defines `exclude_from_capture`, which runs a single command with the
///   original descriptors and exits the script on failure (the explicit exit
///   is needed because the command runs under a temporary `set +e`),
/// * and finally invokes `start_capture` so capture is active from the start.
fn bash_capture_shell_helper(container_name: &str) -> String {
// Always log to file with the default automatic log config; the CRD's
// per-container log config options are intentionally not consulted here.
let capture_shell_output = product_logging::framework::capture_shell_output(
STACKABLE_LOG_DIR,
container_name,
// we do not access any of the crd config options for this and just log it to file
&AutomaticContainerLogConfig::default(),
);

// `{{`/`}}` below are formatdoc escapes for literal bash braces; the only
// interpolation is `{capture_shell_output}`.
formatdoc! {
r###"
# Store the original stdout/stderr globally so we can always find our way back
# 3 and 4 are usually safe, but we'll be explicit.
exec 3>&1
exec 4>&2

start_capture() {{
# We redirect 1 and 2 to the background tee processes
{capture_shell_output}
}}

stop_capture() {{
# Restore stdout and stderr from our saved descriptors
exec 1>&3 2>&4
}}

exclude_from_capture() {{
# Temporarily restore original FDs just for the duration of this command
# We use 'local' for the exit code to keep things clean
set +e
"$@" 1>&3 2>&4
local exit_code=$?
set -e

# If the command failed, we manually trigger the exit since we set +e
if [ $exit_code -ne 0 ]; then
exit $exit_code
fi
}}

start_capture
"###
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,48 @@ customConfig:
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "vector"
filteredAutomaticLogConfigNameNode0FormatNameNode:
filteredAutomaticLogConfigNameNode0FormatNameNodeLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes"
filteredAutomaticLogConfigNameNode1FormatNameNode:
.container == "format-namenodes" &&
.file == "format-namenodes.log4j.xml"
filteredAutomaticLogConfigNameNode0FormatNameNodeStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode0FormatNameNodeStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigNameNode1FormatNameNodeLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes"
.container == "format-namenodes" &&
.file == "format-namenodes.log4j.xml"
filteredAutomaticLogConfigNameNode1FormatNameNodeStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode1FormatNameNodeStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigNameNode0FormatZookeeper:
type: filter
inputs: [validEvents]
Expand Down