diff --git a/CHANGELOG.md b/CHANGELOG.md index f9608259..cda16407 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,13 @@ All notable changes to this project will be documented in this file. See [objectOverrides concepts page](https://docs.stackable.tech/home/nightly/concepts/overrides/#object-overrides) for details ([#741]). - Enable the [restart-controller](https://docs.stackable.tech/home/nightly/commons-operator/restarter/), so that the Pods are automatically restarted on config changes ([#743]). +### Fixed + +- Previously, some shell output of init-containers was not logged properly and therefore not aggregated, which is fixed now ([#746]). + [#741]: https://github.com/stackabletech/hdfs-operator/pull/741 [#743]: https://github.com/stackabletech/hdfs-operator/pull/743 +[#746]: https://github.com/stackabletech/hdfs-operator/pull/746 ## [25.11.0] - 2025-11-07 diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index b09be984..9ec6de7a 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -44,8 +44,8 @@ use stackable_operator::{ LoggingError, create_vector_shutdown_file_command, remove_vector_shutdown_file_command, }, spec::{ - ConfigMapLogConfig, ContainerLogConfig, ContainerLogConfigChoice, - CustomContainerLogConfig, + AutomaticContainerLogConfig, ConfigMapLogConfig, ContainerLogConfig, + ContainerLogConfigChoice, CustomContainerLogConfig, }, }, role_utils::RoleGroupRef, @@ -627,22 +627,22 @@ impl ContainerConfig { &merged_config.hdfs_logging(), )); - args.push_str(&format!( + args.push_str(&formatdoc!( r#"\ -{COMMON_BASH_TRAP_FUNCTIONS} -{remove_vector_shutdown_file_command} -prepare_signal_handlers -containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop & -if [[ -d {LISTENER_VOLUME_DIR} ]]; then - export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address) - for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do - export $(basename 
$i | tr a-z- A-Z_)_PORT="$(cat $i)" - done -fi -{hadoop_home}/bin/hdfs {role} {upgrade_args} & -wait_for_termination $! -{create_vector_shutdown_file_command} -"#, + {COMMON_BASH_TRAP_FUNCTIONS} + {remove_vector_shutdown_file_command} + prepare_signal_handlers + containerdebug --output={STACKABLE_LOG_DIR}/containerdebug-state.json --loop & + if [[ -d {LISTENER_VOLUME_DIR} ]]; then + export POD_ADDRESS=$(cat {LISTENER_VOLUME_DIR}/default-address/address) + for i in {LISTENER_VOLUME_DIR}/default-address/ports/*; do + export $(basename $i | tr a-z- A-Z_)_PORT="$(cat $i)" + done + fi + {hadoop_home}/bin/hdfs {role} {upgrade_args} & + wait_for_termination $! + {create_vector_shutdown_file_command} + "#, hadoop_home = Self::HADOOP_HOME, remove_vector_shutdown_file_command = remove_vector_shutdown_file_command(STACKABLE_LOG_DIR), @@ -664,7 +664,9 @@ wait_for_termination $! hadoop_home = Self::HADOOP_HOME )); } - ContainerConfig::FormatNameNodes { .. } => { + ContainerConfig::FormatNameNodes { container_name, .. } => { + args.push_str(&bash_capture_shell_helper(container_name)); + if let Some(container_config) = merged_config.as_namenode().map(|node| { node.logging .for_container(&NameNodeContainer::FormatNameNodes) @@ -690,32 +692,36 @@ wait_for_termination $! for namenode_id in {pod_names} do echo -n "Checking pod $namenode_id... " - {get_service_state_command} + + # We only redirect 2 (stderr) to 4 (console). + # We leave 1 (stdout) alone so the $(...) can catch it. + SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true) + if [ "$SERVICE_STATE" == "active" ] then - ACTIVE_NAMENODE=$namenode_id + ACTIVE_NAMENODE="$namenode_id" echo "active" break + else + echo "unknown / unreachable" fi - echo "" done if [ ! -f "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" ] then if [ -z ${{ACTIVE_NAMENODE+x}} ] then - echo "Create pod $POD_NAME as active namenode." 
- {hadoop_home}/bin/hdfs namenode -format -noninteractive + echo "No active namenode found. Formatting $POD_NAME as active." + exclude_from_capture {hadoop_home}/bin/hdfs namenode -format -noninteractive else - echo "Create pod $POD_NAME as standby namenode." - {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive + echo "Active namenode is $ACTIVE_NAMENODE. Bootstrapping standby." + exclude_from_capture {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive fi else cat "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" echo "Pod $POD_NAME already formatted. Skipping..." fi "###, - get_service_state_command = Self::get_namenode_service_state_command(), hadoop_home = Self::HADOOP_HOME, pod_names = namenode_podrefs .iter() @@ -724,7 +730,9 @@ wait_for_termination $! .join(" "), )); } - ContainerConfig::FormatZooKeeper { .. } => { + ContainerConfig::FormatZooKeeper { container_name, .. } => { + args.push_str(&bash_capture_shell_helper(container_name)); + if let Some(container_config) = merged_config.as_namenode().map(|node| { node.logging .for_container(&NameNodeContainer::FormatZooKeeper) @@ -739,7 +747,7 @@ wait_for_termination $! echo "Attempt to format ZooKeeper..." if [[ "0" -eq "$(echo $POD_NAME | sed -e 's/.*-//')" ]] ; then set +e - {hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive + exclude_from_capture {hadoop_home}/bin/hdfs zkfc -formatZK -nonInteractive EXITCODE=$? set -e if [[ $EXITCODE -eq 0 ]]; then @@ -755,10 +763,12 @@ wait_for_termination $! echo "ZooKeeper already formatted!" fi "###, - hadoop_home = Self::HADOOP_HOME + hadoop_home = Self::HADOOP_HOME, )); } - ContainerConfig::WaitForNameNodes { .. } => { + ContainerConfig::WaitForNameNodes { container_name, .. } => { + args.push_str(&bash_capture_shell_helper(container_name)); + if let Some(container_config) = merged_config.as_datanode().map(|node| { node.logging .for_container(&DataNodeContainer::WaitForNameNodes) @@ -781,7 +791,11 @@ wait_for_termination $! 
for namenode_id in {pod_names} do echo -n "Checking pod $namenode_id... " - {get_service_state_command} + + # We only redirect 2 (stderr) to 4 (console). + # We leave 1 (stdout) alone so the $(...) can catch it. + SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState "$namenode_id" 2>&4 | tail -n1 || true) + if [ "$SERVICE_STATE" = "active" ] || [ "$SERVICE_STATE" = "standby" ] then echo "$SERVICE_STATE" @@ -800,7 +814,7 @@ wait_for_termination $! sleep 5 done "###, - get_service_state_command = Self::get_namenode_service_state_command(), + hadoop_home = Self::HADOOP_HOME, pod_names = namenode_podrefs .iter() .map(|pod_ref| pod_ref.pod_name.as_ref()) @@ -842,14 +856,6 @@ wait_for_termination $! )) } - fn get_namenode_service_state_command() -> String { - formatdoc!( - r###" - SERVICE_STATE=$({hadoop_home}/bin/hdfs haadmin -getServiceState $namenode_id | tail -n1 || true)"###, - hadoop_home = Self::HADOOP_HOME, - ) - } - /// Returns the container env variables. fn env( &self, @@ -1565,3 +1571,47 @@ impl TryFrom<&str> for ContainerVolumeDirs { }) } } + +fn bash_capture_shell_helper(container_name: &str) -> String { + let capture_shell_output = product_logging::framework::capture_shell_output( + STACKABLE_LOG_DIR, + container_name, + // we do not access any of the crd config options for this and just log it to file + &AutomaticContainerLogConfig::default(), + ); + + formatdoc! { + r###" + # Store the original stdout/stderr globally so we can always find our way back + # 3 and 4 are usually safe, but we'll be explicit. + exec 3>&1 + exec 4>&2 + + start_capture() {{ + # We redirect 1 and 2 to the background tee processes + {capture_shell_output} + }} + + stop_capture() {{ + # Restore stdout and stderr from our saved descriptors + exec 1>&3 2>&4 + }} + + exclude_from_capture() {{ + # Temporarily restore original FDs just for the duration of this command + # We use 'local' for the exit code to keep things clean + set +e + "$@" 1>&3 2>&4 + local exit_code=$? 
+ set -e + + # If the command failed, we manually trigger the exit since we set +e + if [ $exit_code -ne 0 ]; then + exit $exit_code + fi + }} + + start_capture + "### + } +} diff --git a/tests/templates/kuttl/logging/hdfs-vector-aggregator-values.yaml.j2 b/tests/templates/kuttl/logging/hdfs-vector-aggregator-values.yaml.j2 index 82267273..4cb67af6 100644 --- a/tests/templates/kuttl/logging/hdfs-vector-aggregator-values.yaml.j2 +++ b/tests/templates/kuttl/logging/hdfs-vector-aggregator-values.yaml.j2 @@ -60,18 +60,48 @@ customConfig: condition: >- .pod == "test-hdfs-automatic-log-namenode-default-1" && .container == "vector" - filteredAutomaticLogConfigNameNode0FormatNameNode: + filteredAutomaticLogConfigNameNode0FormatNameNodeLog4j: type: filter inputs: [validEvents] condition: >- .pod == "test-hdfs-automatic-log-namenode-default-0" && - .container == "format-namenodes" - filteredAutomaticLogConfigNameNode1FormatNameNode: + .container == "format-namenodes" && + .file == "format-namenodes.log4j.xml" + filteredAutomaticLogConfigNameNode0FormatNameNodeStdout: + type: filter + inputs: [validEvents] + condition: >- + .pod == "test-hdfs-automatic-log-namenode-default-0" && + .container == "format-namenodes" && + .file == "container.stdout.log" + filteredAutomaticLogConfigNameNode0FormatNameNodeStderr: + type: filter + inputs: [validEvents] + condition: >- + .pod == "test-hdfs-automatic-log-namenode-default-0" && + .container == "format-namenodes" && + .file == "container.stderr.log" + filteredAutomaticLogConfigNameNode1FormatNameNodeLog4j: type: filter inputs: [validEvents] condition: >- .pod == "test-hdfs-automatic-log-namenode-default-1" && - .container == "format-namenodes" + .container == "format-namenodes" && + .file == "format-namenodes.log4j.xml" + filteredAutomaticLogConfigNameNode1FormatNameNodeStdout: + type: filter + inputs: [validEvents] + condition: >- + .pod == "test-hdfs-automatic-log-namenode-default-1" && + .container == "format-namenodes" && + .file == 
"container.stdout.log" + filteredAutomaticLogConfigNameNode1FormatNameNodeStderr: + type: filter + inputs: [validEvents] + condition: >- + .pod == "test-hdfs-automatic-log-namenode-default-1" && + .container == "format-namenodes" && + .file == "container.stderr.log" filteredAutomaticLogConfigNameNode0FormatZookeeper: type: filter inputs: [validEvents]