Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ All notable changes to this project will be documented in this file.
See [objectOverrides concepts page](https://docs.stackable.tech/home/nightly/concepts/overrides/#object-overrides) for details ([#741]).
- Enable the [restart-controller](https://docs.stackable.tech/home/nightly/commons-operator/restarter/), so that the Pods are automatically restarted on config changes ([#743]).

### Fixed

- Previously, some shell output of init-containers was not logged properly and therefore not aggregated, which is fixed now ([#746]).

[#741]: https://github.com/stackabletech/hdfs-operator/pull/741
[#743]: https://github.com/stackabletech/hdfs-operator/pull/743
[#746]: https://github.com/stackabletech/hdfs-operator/pull/746

## [25.11.0] - 2025-11-07

Expand Down
137 changes: 88 additions & 49 deletions rust/operator-binary/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ use stackable_operator::{
LoggingError, create_vector_shutdown_file_command, remove_vector_shutdown_file_command,
},
spec::{
ConfigMapLogConfig, ContainerLogConfig, ContainerLogConfigChoice,
CustomContainerLogConfig,
AutomaticContainerLogConfig, ConfigMapLogConfig, ContainerLogConfig,
ContainerLogConfigChoice, CustomContainerLogConfig,
},
},
role_utils::RoleGroupRef,
Expand Down Expand Up @@ -627,22 +627,22 @@ impl ContainerConfig {
&merged_config.hdfs_logging(),
));

args.push_str(&format!(
args.push_str(&formatdoc!(
r#"\
{COMMON_BASH_TRAP_FUNCTIONS}
{remove_vector_shutdown_file_command}
prepare_signal_handlers
containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop &
if [[ -d {LISTENER_VOLUME_DIR} ]]; then
export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address)
for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do
export $(basename $i | tr a-z- A-Z_)_PORT="$(cat $i)"
done
fi
{hadoop_home}/bin/hdfs {role} {upgrade_args} &
wait_for_termination $!
{create_vector_shutdown_file_command}
"#,
{COMMON_BASH_TRAP_FUNCTIONS}
{remove_vector_shutdown_file_command}
prepare_signal_handlers
containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop &
if [[ -d {LISTENER_VOLUME_DIR} ]]; then
export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address)
for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do
export $(basename $i | tr a-z- A-Z_)_PORT="$(cat $i)"
done
fi
{hadoop_home}/bin/hdfs {role} {upgrade_args} &
wait_for_termination $!
{create_vector_shutdown_file_command}
"#,
hadoop_home = Self::HADOOP_HOME,
remove_vector_shutdown_file_command =
remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
Expand All @@ -664,7 +664,9 @@ wait_for_termination $!
hadoop_home = Self::HADOOP_HOME
));
}
ContainerConfig::FormatNameNodes { .. } => {
ContainerConfig::FormatNameNodes { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_namenode().map(|node| {
node.logging
.for_container(&NameNodeContainer::FormatNameNodes)
Expand All @@ -690,32 +692,36 @@ wait_for_termination $!
for namenode_id in {pod_names}
do
echo -n "Checking pod $namenode_id... "
{get_service_state_command}

# We only redirect 2 (stderr) to 4 (console).
# We leave 1 (stdout) alone so the $(...) can catch it.
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true)

if [ "$SERVICE_STATE" == "active" ]
then
ACTIVE_NAMENODE=$namenode_id
ACTIVE_NAMENODE="$namenode_id"
echo "active"
break
else
echo "unknown / unreachable"
fi
echo ""
done

if [ ! -f "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" ]
then
if [ -z ${{ACTIVE_NAMENODE+x}} ]
then
echo "Create pod $POD_NAME as active namenode."
{hadoop_home}/bin/hdfs namenode -format -noninteractive
echo "No active namenode found. Formatting $POD_NAME as active."
exclude_from_capture {hadoop_home}/bin/hdfs namenode -format -noninteractive
else
echo "Create pod $POD_NAME as standby namenode."
{hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive
echo "Active namenode is $ACTIVE_NAMENODE. Bootstrapping standby."
exclude_from_capture {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive
fi
else
cat "{NAMENODE_ROOT_DATA_DIR}/current/VERSION"
echo "Pod $POD_NAME already formatted. Skipping..."
fi
"###,
get_service_state_command = Self::get_namenode_service_state_command(),
hadoop_home = Self::HADOOP_HOME,
pod_names = namenode_podrefs
.iter()
Expand All @@ -724,7 +730,9 @@ wait_for_termination $!
.join(" "),
));
}
ContainerConfig::FormatZooKeeper { .. } => {
ContainerConfig::FormatZooKeeper { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_namenode().map(|node| {
node.logging
.for_container(&NameNodeContainer::FormatZooKeeper)
Expand All @@ -736,29 +744,27 @@ wait_for_termination $!
}
args.push_str(&formatdoc!(
r###"
echo "Attempt to format ZooKeeper..."
echo "Attempt to format ZooKeeper ZNode for $POD_NAME ..."
if [[ "0" -eq "$(echo $POD_NAME | sed -e 's/.*-//')" ]] ; then
set +e
{hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive
EXITCODE=$?
set -e
EXITCODE=$(exclude_from_capture {hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive)
if [[ $EXITCODE -eq 0 ]]; then
echo "Successfully formatted"
echo "Successfully formatted ZooKeeper ZNode."
elif [[ $EXITCODE -eq 2 ]]; then
echo "ZNode already existed, did nothing"
echo "ZNode already exists, nothing to do."
else
echo "Zookeeper format failed with exit code $EXITCODE"
echo "ZooKeeper format ZNode failed with exit code $EXITCODE".
exit $EXITCODE
fi

else
echo "ZooKeeper already formatted!"
echo "ZooKeeper ZNode already formatted!"
fi
"###,
hadoop_home = Self::HADOOP_HOME
hadoop_home = Self::HADOOP_HOME,
));
}
ContainerConfig::WaitForNameNodes { .. } => {
ContainerConfig::WaitForNameNodes { container_name, .. } => {
args.push_str(&bash_capture_shell_helper(container_name));

if let Some(container_config) = merged_config.as_datanode().map(|node| {
node.logging
.for_container(&DataNodeContainer::WaitForNameNodes)
Expand All @@ -781,7 +787,11 @@ wait_for_termination $!
for namenode_id in {pod_names}
do
echo -n "Checking pod $namenode_id... "
{get_service_state_command}

# We only redirect 2 (stderr) to 4 (console).
# We leave 1 (stdout) alone so the $(...) can catch it.
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true)

if [ "$SERVICE_STATE" = "active" ] || [ "$SERVICE_STATE" = "standby" ]
then
echo "$SERVICE_STATE"
Expand All @@ -800,7 +810,7 @@ wait_for_termination $!
sleep 5
done
"###,
get_service_state_command = Self::get_namenode_service_state_command(),
hadoop_home = Self::HADOOP_HOME,
pod_names = namenode_podrefs
.iter()
.map(|pod_ref| pod_ref.pod_name.as_ref())
Expand Down Expand Up @@ -842,14 +852,6 @@ wait_for_termination $!
))
}

fn get_namenode_service_state_command() -> String {
formatdoc!(
r###"
SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState $namenode_id | tail -n1 || true)"###,
hadoop_home = Self::HADOOP_HOME,
)
}

/// Returns the container env variables.
fn env(
&self,
Expand Down Expand Up @@ -1565,3 +1567,40 @@ impl TryFrom<&str> for ContainerVolumeDirs {
})
}
}

/// Builds a bash preamble that tees an init-container's stdout/stderr into
/// log files under `STACKABLE_LOG_DIR` (via `capture_shell_output`) so the
/// shell output can be picked up by the log agent, while keeping the original
/// console streams reachable on file descriptors 3 and 4.
///
/// The generated script defines three shell functions:
/// - `start_capture`: redirects fds 1/2 into the capture pipeline (invoked
///   immediately at the end of the preamble).
/// - `stop_capture`: restores fds 1/2 from the saved fds 3/4.
/// - `exclude_from_capture`: runs a single command with the original console
///   fds restored, then echoes the command's exit code to (captured) stdout.
///
/// # Arguments
/// * `container_name` - name of the container; used by `capture_shell_output`
///   to derive the log file names.
///
/// # Returns
/// The bash snippet as a `String`, intended to be prepended to an
/// init-container's command args.
///
/// NOTE(review): `exclude_from_capture` always exits 0 (its status is that of
/// the final `echo`), so a failing wrapped command is only detected if the
/// caller captures and inspects the echoed code via `$(...)`, or if `set -e`
/// aborts the function body first — TODO confirm this is intended for call
/// sites that invoke it as a bare statement.
fn bash_capture_shell_helper(container_name: &str) -> String {
    // Snippet (from product_logging) that redirects fds 1/2 into background
    // tee processes writing container.stdout.log / container.stderr.log.
    let capture_shell_output = product_logging::framework::capture_shell_output(
        STACKABLE_LOG_DIR,
        container_name,
        // we do not access any of the crd config options for this and just log it to file
        &AutomaticContainerLogConfig::default(),
    );

    // Doubled braces `{{`/`}}` are formatdoc! escapes producing literal
    // `{`/`}` in the emitted bash function definitions.
    formatdoc! {
        r###"
        # Store the original stdout/stderr globally so we can always find our way back
        # 3 and 4 are usually safe, but we'll be explicit.
        exec 3>&1
        exec 4>&2

        start_capture() {{
          # We redirect 1 and 2 to the background tee processes
          {capture_shell_output}
        }}

        stop_capture() {{
          # Restore stdout and stderr from our saved descriptors
          exec 1>&3 2>&4
        }}

        exclude_from_capture() {{
          # Temporarily restore original FDs just for the duration of this command
          # We use 'local' for the exit code to keep things clean
          "$@" 1>&3 2>&4
          echo $?
        }}

        start_capture
        "###
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,83 @@ customConfig:
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "vector"
filteredAutomaticLogConfigNameNode0FormatNameNode:
filteredAutomaticLogConfigNameNode0FormatNameNodeLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes"
filteredAutomaticLogConfigNameNode1FormatNameNode:
.container == "format-namenodes" &&
.file == "format-namenodes.log4j.xml"
filteredAutomaticLogConfigNameNode0FormatNameNodeStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode0FormatNameNodeStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-namenodes" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigNameNode1FormatNameNodeLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes"
filteredAutomaticLogConfigNameNode0FormatZookeeper:
.container == "format-namenodes" &&
.file == "format-namenodes.log4j.xml"
filteredAutomaticLogConfigNameNode1FormatNameNodeStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode1FormatNameNodeStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-namenodes" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigNameNode0FormatZookeeperLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-zookeeper"
.container == "format-zookeeper" &&
.file == "format-zookeeper.log4j.xml"
filteredAutomaticLogConfigNameNode0FormatZookeeperStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-zookeeper" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode0FormatZookeeperStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-0" &&
.container == "format-zookeeper" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigNameNode1FormatZookeeperStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-zookeeper" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigNameNode1FormatZookeeperStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-namenode-default-1" &&
.container == "format-zookeeper" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigDataNode0:
type: filter
inputs: [validEvents]
Expand All @@ -90,12 +149,27 @@ customConfig:
condition: >-
.pod == "test-hdfs-automatic-log-datanode-default-0" &&
.container == "vector"
filteredAutomaticLogConfigDataNode0WaitForNameNodes:
filteredAutomaticLogConfigDataNode0WaitForNameNodesLog4j:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-datanode-default-0" &&
.container == "wait-for-namenodes"
.container == "wait-for-namenodes" &&
.file == "wait-for-namenodes.log4j.xml"
filteredAutomaticLogConfigDataNode0WaitForNameNodesStdout:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-datanode-default-0" &&
.container == "wait-for-namenodes" &&
.file == "container.stdout.log"
filteredAutomaticLogConfigDataNode0WaitForNameNodesStderr:
type: filter
inputs: [validEvents]
condition: >-
.pod == "test-hdfs-automatic-log-datanode-default-0" &&
.container == "wait-for-namenodes" &&
.file == "container.stderr.log"
filteredAutomaticLogConfigJournalNode0:
type: filter
inputs: [validEvents]
Expand Down