Improved error handling for support bundles and redactors for windows (#1878)

* improved error handling and window locking

* Delete all-windows-collectors.yaml

* addressing bugbot concerns

* Update host_tcpportstatus.go

* Update redact.go
This commit is contained in:
Noah Campbell
2025-10-06 14:32:29 -07:00
committed by GitHub
parent b4e3ed278a
commit 12f8bc3daa
13 changed files with 355 additions and 55 deletions

View File

@@ -1,7 +1,7 @@
apiVersion: troubleshoot.sh/v1beta2
kind: SupportBundle
metadata:
name: all-collectors-test
name: all-host-collectors
spec:
hostCollectors:
# System Info Collectors
@@ -46,25 +46,51 @@ spec:
fileSize: 10Mi
operationSizeBytes: 2300
# Certificate
# Certificate Collectors
- certificate:
collectorName: test-cert
certificatePath: /etc/ssl/certs/ca-certificates.crt
- certificatesCollection:
collectorName: certs-collection
paths:
- /etc/ssl/certs
# Network Tests
- tcpPortStatus:
collectorName: ssh-port
port: 22
- udpPortStatus:
collectorName: dns-port
port: 53
- tcpConnect:
collectorName: localhost-ssh
address: 127.0.0.1:22
- tcpLoadBalancer:
collectorName: lb-test
address: 127.0.0.1
port: 80
- httpLoadBalancer:
collectorName: http-lb-test
address: 127.0.0.1
port: 80
path: /healthz
- http:
collectorName: google
get:
url: https://www.google.com
- dns:
collectorName: dns-google
hostname: google.com
hostnames:
- google.com
- subnetAvailable:
collectorName: subnet-check
CIDRRangeAlloc: 10.0.0.0/16
desiredCIDR: 24
- networkNamespaceConnectivity:
collectorName: netns-connectivity
fromCIDR: 10.0.0.0/8
toCIDR: 192.168.0.0/16
port: 80
# Custom Commands
- run:

View File

@@ -0,0 +1,170 @@
apiVersion: troubleshoot.sh/v1beta2
kind: SupportBundle
metadata:
name: all-kubernetes-collectors
spec:
collectors:
# Cluster Info Collectors (2)
- clusterInfo: {}
- clusterResources: {}
# Metrics Collectors (2)
- customMetrics:
collectorName: custom-metrics
metricRequests:
- resourceMetricName: example-metric
- nodeMetrics: {}
# ConfigMap and Secret Collectors (2)
- configMap:
collectorName: example-configmap
name: example-configmap
namespace: default
includeValue: false
- secret:
collectorName: example-secret
name: example-secret
namespace: default
includeValue: false
# Logs Collector (1)
- logs:
collectorName: example-logs
selector:
- app=example
namespace: default
limits:
maxAge: 720h
maxLines: 10000
# Pod Execution Collectors (4)
- run:
collectorName: run-example
name: run-example
namespace: default
image: busybox:latest
command: ["echo"]
args: ["hello from run"]
- runPod:
collectorName: run-pod-example
name: run-pod-example
namespace: default
podSpec:
containers:
- name: example
image: busybox:latest
command: ["echo", "hello from runPod"]
- runDaemonSet:
collectorName: run-daemonset-example
name: run-daemonset-example
namespace: default
podSpec:
containers:
- name: example
image: busybox:latest
command: ["echo", "hello from runDaemonSet"]
- exec:
collectorName: exec-example
name: exec-example
selector:
- app=example
namespace: default
command: ["echo"]
args: ["hello from exec"]
# Data Collector (1)
- data:
collectorName: static-data
name: static-data.txt
data: "This is static data"
# Copy Collectors (2)
- copy:
collectorName: copy-example
selector:
- app=example
namespace: default
containerPath: /tmp
- copyFromHost:
collectorName: copy-from-host-example
name: copy-from-host-example
namespace: default
image: busybox:latest
hostPath: /tmp/example
# HTTP Collector (1)
- http:
collectorName: http-get-example
get:
url: https://www.google.com
insecureSkipVerify: false
# Database Collectors (4)
- postgres:
collectorName: postgres-example
uri: postgresql://user:password@localhost:5432/dbname
- mysql:
collectorName: mysql-example
uri: user:password@tcp(localhost:3306)/dbname
- mssql:
collectorName: mssql-example
uri: sqlserver://user:password@localhost:1433?database=dbname
- redis:
collectorName: redis-example
uri: redis://localhost:6379
# Storage and System Collectors (3)
- collectd:
collectorName: collectd-example
namespace: default
image: busybox:latest
hostPath: /var/lib/collectd
- ceph:
collectorName: ceph-example
namespace: rook-ceph
- longhorn:
collectorName: longhorn-example
namespace: longhorn-system
# Registry and Image Collector (1)
- registryImages:
collectorName: registry-images-example
namespace: default
images:
- busybox:latest
# Sysctl Collector (1)
- sysctl:
collectorName: sysctl-example
name: sysctl-example
namespace: default
image: busybox:latest
# Certificate Collector (1)
- certificates:
collectorName: certificates-example
secrets:
- name: tls-secret
namespaces:
- default
# Application-Specific Collectors (3)
- helm:
collectorName: helm-example
namespace: default
releaseName: example-release
collectValues: false
- goldpinger:
collectorName: goldpinger-example
namespace: default
- sonobuoy:
collectorName: sonobuoy-example
namespace: sonobuoy
# DNS and Network Collectors (2)
- dns:
collectorName: dns-example
timeout: 10s
- etcd:
collectorName: etcd-example
image: quay.io/coreos/etcd:latest

View File

@@ -32,9 +32,11 @@ func (c *CollectHostCertificate) IsExcluded() (bool, error) {
func (c *CollectHostCertificate) Collect(progressChan chan<- interface{}) (map[string][]byte, error) {
var result = KeyPairValid
var collectorErr error
_, err := tls.LoadX509KeyPair(c.hostCollector.CertificatePath, c.hostCollector.KeyPath)
if err != nil {
collectorErr = err
if strings.Contains(err.Error(), "no such file") {
result = KeyPairMissing
} else if strings.Contains(err.Error(), "PEM inputs may have been switched") {
@@ -67,7 +69,7 @@ func (c *CollectHostCertificate) Collect(progressChan chan<- interface{}) (map[s
return map[string][]byte{
name: b,
}, nil
}, collectorErr
}
func isEncryptedKey(filename string) (bool, error) {

View File

@@ -48,11 +48,11 @@ func (c *CollectHostCopy) Collect(progressChan chan<- interface{}) (map[string][
klog.Errorf("Failed to copy files from %q to %q: %v", c.hostCollector.Path, "<bundle>/"+bundleRelPath, err)
fileName := fmt.Sprintf("%s/errors.json", c.relBundlePath(bundlePathDest))
output := NewResult()
err := output.SaveResult(c.BundlePath, fileName, marshalErrors([]string{err.Error()}))
if err != nil {
return nil, err
saveErr := output.SaveResult(c.BundlePath, fileName, marshalErrors([]string{err.Error()}))
if saveErr != nil {
return nil, saveErr
}
return output, nil
return output, err
}
return result, nil

View File

@@ -80,11 +80,15 @@ func (c *CollectHostHTTPLoadBalancer) Collect(progressChan chan<- interface{}) (
}()
var networkStatus NetworkStatus
var errorMessage string
var collectorErr error
stopAfter := time.Now().Add(timeout)
for {
if len(listenErr) > 0 {
err := <-listenErr
errorMessage = err.Error()
collectorErr = errors.Wrap(err, "failed to listen on HTTP port")
if strings.Contains(err.Error(), "address already in use") {
networkStatus = NetworkStatusAddressInUse
break
@@ -113,7 +117,8 @@ func (c *CollectHostHTTPLoadBalancer) Collect(progressChan chan<- interface{}) (
}
result := NetworkStatusResult{
Status: networkStatus,
Status: networkStatus,
Message: errorMessage,
}
b, err := json.Marshal(result)
@@ -132,7 +137,7 @@ func (c *CollectHostHTTPLoadBalancer) Collect(progressChan chan<- interface{}) (
return map[string][]byte{
name: b,
}, nil
}, collectorErr
}
func attemptPOST(address string, request []byte, response []byte) NetworkStatus {

View File

@@ -2,6 +2,7 @@ package collect
import (
"bytes"
"fmt"
"net"
"regexp"
"strconv"
@@ -70,19 +71,20 @@ func isValidLoadBalancerAddress(address string) bool {
return len(errs) == 0
}
func checkTCPConnection(progressChan chan<- interface{}, listenAddress string, dialAddress string, timeout time.Duration) (NetworkStatus, error) {
func checkTCPConnection(progressChan chan<- interface{}, listenAddress string, dialAddress string, timeout time.Duration) (NetworkStatus, string, error) {
if !isValidLoadBalancerAddress(dialAddress) {
return NetworkStatusInvalidAddress, errors.Errorf("Invalid Load Balancer Address: %v", dialAddress)
errMsg := fmt.Sprintf("Invalid Load Balancer Address: %v", dialAddress)
return NetworkStatusInvalidAddress, errMsg, errors.New(errMsg)
}
lstn, err := net.Listen("tcp", listenAddress)
if err != nil {
if strings.Contains(err.Error(), "address already in use") {
return NetworkStatusAddressInUse, nil
return NetworkStatusAddressInUse, err.Error(), errors.Wrap(err, "failed to create listener")
}
return NetworkStatusErrorOther, errors.Wrap(err, "failed to create listener")
return NetworkStatusErrorOther, err.Error(), errors.Wrap(err, "failed to create listener")
}
defer lstn.Close()
@@ -110,7 +112,8 @@ func checkTCPConnection(progressChan chan<- interface{}, listenAddress string, d
if time.Now().After(stopAfter) {
debug.Printf("Timeout")
return NetworkStatusConnectionTimeout, nil
errMsg := "connection timeout"
return NetworkStatusConnectionTimeout, errMsg, errors.New(errMsg)
}
conn, err := net.DialTimeout("tcp", dialAddress, 50*time.Millisecond)
@@ -124,13 +127,13 @@ func checkTCPConnection(progressChan chan<- interface{}, listenAddress string, d
continue
}
if strings.Contains(err.Error(), "connection refused") {
return NetworkStatusConnectionRefused, nil
return NetworkStatusConnectionRefused, err.Error(), errors.Wrap(err, "failed to dial")
}
return NetworkStatusErrorOther, errors.Wrap(err, "failed to dial")
return NetworkStatusErrorOther, err.Error(), errors.Wrap(err, "failed to dial")
}
if verifyConnectionToServer(conn, requestToken, responseToken) {
return NetworkStatusConnected, nil
return NetworkStatusConnected, "", nil
}
progressChan <- errors.New("failed to verify connection to server")

View File

@@ -37,8 +37,10 @@ func (c *CollectHostTCPConnect) Collect(progressChan chan<- interface{}) (map[st
}
}
status, message := attemptConnect(address, timeout)
result := NetworkStatusResult{
Status: attemptConnect(address, timeout),
Status: status,
Message: message,
}
b, err := json.Marshal(result)
@@ -55,25 +57,31 @@ func (c *CollectHostTCPConnect) Collect(progressChan chan<- interface{}) (map[st
output := NewResult()
output.SaveResult(c.BundlePath, name, bytes.NewBuffer(b))
var collectorErr error
if status != NetworkStatusConnected && message != "" {
collectorErr = errors.Errorf("failed to connect to %s: %s", address, message)
}
return map[string][]byte{
name: b,
}, nil
}, collectorErr
}
func attemptConnect(address string, timeout time.Duration) NetworkStatus {
func attemptConnect(address string, timeout time.Duration) (NetworkStatus, string) {
conn, err := net.DialTimeout("tcp", address, timeout)
if err != nil {
errorMessage := err.Error()
if strings.Contains(err.Error(), "i/o timeout") {
return NetworkStatusConnectionTimeout
return NetworkStatusConnectionTimeout, errorMessage
}
if strings.Contains(err.Error(), "connection refused") {
return NetworkStatusConnectionRefused
return NetworkStatusConnectionRefused, errorMessage
}
return NetworkStatusErrorOther
return NetworkStatusErrorOther, errorMessage
}
conn.Close()
return NetworkStatusConnected
return NetworkStatusConnected, ""
}
func (c *CollectHostTCPConnect) RemoteCollect(progressChan chan<- interface{}) (map[string][]byte, error) {

View File

@@ -44,11 +44,11 @@ func (c *CollectHostTCPLoadBalancer) Collect(progressChan chan<- interface{}) (m
return nil, errors.Wrap(err, "failed to parse duration")
}
}
networkStatus, err := checkTCPConnection(progressChan, listenAddress, dialAddress, timeout)
networkStatus, errorMessage, err := checkTCPConnection(progressChan, listenAddress, dialAddress, timeout)
if err != nil {
result := NetworkStatusResult{
Status: networkStatus,
Message: err.Error(),
Message: errorMessage,
}
b, err := json.Marshal(result)
if err != nil {
@@ -62,7 +62,8 @@ func (c *CollectHostTCPLoadBalancer) Collect(progressChan chan<- interface{}) (m
}, err
}
result := NetworkStatusResult{
Status: networkStatus,
Status: networkStatus,
Message: errorMessage,
}
b, err := json.Marshal(result)

View File

@@ -50,13 +50,11 @@ func (c *CollectHostTCPPortStatus) Collect(progressChan chan<- interface{}) (map
dialAddress = fmt.Sprintf("%s:%d", ip, c.hostCollector.Port)
}
networkStatus, err := checkTCPConnection(progressChan, listenAddress, dialAddress, 10*time.Second)
if err != nil {
return nil, err
}
networkStatus, errorMessage, checkErr := checkTCPConnection(progressChan, listenAddress, dialAddress, 10*time.Second)
result := NetworkStatusResult{
Status: networkStatus,
Status: networkStatus,
Message: errorMessage,
}
b, err := json.Marshal(result)
if err != nil {
@@ -74,7 +72,7 @@ func (c *CollectHostTCPPortStatus) Collect(progressChan chan<- interface{}) (map
return map[string][]byte{
name: b,
}, nil
}, checkErr
}
func getIPv4FromInterface(iface *net.Interface) (net.IP, error) {

View File

@@ -43,8 +43,12 @@ func (c *CollectHostUDPPortStatus) Collect(progressChan chan<- interface{}) (map
}
var networkStatus NetworkStatus
var errorMessage string
var listenErr error
lstn, err := net.ListenUDP("udp", &listenAddress)
if err != nil {
errorMessage = err.Error()
listenErr = errors.Wrap(err, "failed to listen on UDP port")
if strings.Contains(err.Error(), "address already in use") {
networkStatus = NetworkStatusAddressInUse
} else {
@@ -56,7 +60,8 @@ func (c *CollectHostUDPPortStatus) Collect(progressChan chan<- interface{}) (map
}
result := NetworkStatusResult{
Status: networkStatus,
Status: networkStatus,
Message: errorMessage,
}
b, err := json.Marshal(result)
if err != nil {
@@ -74,7 +79,7 @@ func (c *CollectHostUDPPortStatus) Collect(progressChan chan<- interface{}) (map
return map[string][]byte{
name: b,
}, nil
}, listenErr
}
func (c *CollectHostUDPPortStatus) RemoteCollect(progressChan chan<- interface{}) (map[string][]byte, error) {

View File

@@ -1,6 +1,7 @@
package collect
import (
"encoding/json"
"net"
"os"
"strconv"
@@ -30,9 +31,11 @@ func TestCollectHostUDPPortStatus_Collect(t *testing.T) {
}
tests := []struct {
name string
getPort func(t *testing.T) (port int, closeFn func() error)
want map[string][]byte
name string
getPort func(t *testing.T) (port int, closeFn func() error)
wantStatus string
wantMsgContain string
wantErr bool
}{
{
name: "connected",
@@ -42,9 +45,9 @@ func TestCollectHostUDPPortStatus_Collect(t *testing.T) {
conn.Close()
return port, nil
},
want: map[string][]byte{
"host-collectors/udpPortStatus/udpPortStatus.json": []byte(`{"status":"connected","message":""}`),
},
wantStatus: "connected",
wantMsgContain: "",
wantErr: false,
},
{
name: "address-in-use",
@@ -53,9 +56,9 @@ func TestCollectHostUDPPortStatus_Collect(t *testing.T) {
require.NoError(t, err)
return port, conn.Close
},
want: map[string][]byte{
"host-collectors/udpPortStatus/udpPortStatus.json": []byte(`{"status":"address-in-use","message":""}`),
},
wantStatus: "address-in-use",
wantMsgContain: "address already in use",
wantErr: true,
},
}
for _, tt := range tests {
@@ -82,9 +85,23 @@ func TestCollectHostUDPPortStatus_Collect(t *testing.T) {
}
}()
got, err := c.Collect(progresChan)
if tt.wantErr {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.Len(t, got, 1)
var result NetworkStatusResult
err = json.Unmarshal(got["host-collectors/udpPortStatus/udpPortStatus.json"], &result)
require.NoError(t, err)
assert.Equal(t, tt.want, got)
assert.Equal(t, tt.wantStatus, string(result.Status))
if tt.wantMsgContain != "" {
assert.Contains(t, result.Message, tt.wantMsgContain)
} else {
assert.Empty(t, result.Message)
}
})
}
}

View File

@@ -93,6 +93,17 @@ func RedactResult(bundlePath string, input CollectorResult, additionalRedactors
readerCloseFn = func() error { return nil } // No-op for in-memory data
}
// Ensure the reader is eventually closed even on error paths.
// This defer is guarded by setting readerCloseFn to nil after any explicit close
// to prevent double-closing (notably when we must close before rewriting files on Windows).
defer func() {
if readerCloseFn != nil {
if err := readerCloseFn(); err != nil {
klog.Warningf("Failed to close reader for %s: %v", file, err)
}
}
}()
// If the file is .tar, .tgz or .tar.gz, it must not be redacted. Instead it is
// decompressed and each file inside the tar redacted and compressed back into the archive.
if filepath.Ext(file) == ".tar" || filepath.Ext(file) == ".tgz" || strings.HasSuffix(file, ".tar.gz") {
@@ -109,12 +120,13 @@ func RedactResult(bundlePath string, input CollectorResult, additionalRedactors
return
}
// Ensure the reader is closed after processing
// Close the reader before we write back to the same file path (Windows safety)
if err := readerCloseFn(); err != nil {
klog.Warningf("Failed to close reader for %s: %v", file, err)
errorCh <- errors.Wrap(err, "failed to close reader")
return
}
readerCloseFn = nil
err = RedactResult(tmpDir, subResult, additionalRedactors)
if err != nil {
@@ -141,7 +153,25 @@ func RedactResult(bundlePath string, input CollectorResult, additionalRedactors
return
}
err = input.ReplaceResult(bundlePath, file, redacted)
// Fully consume the redacted reader into a buffer while the source file is still open
// This is required on Windows where we can't delete a file that's open
var redactedBuf bytes.Buffer
_, err = io.Copy(&redactedBuf, redacted)
if err != nil {
errorCh <- errors.Wrap(err, "failed to read redacted data")
return
}
// Close the reader now that we've consumed all the data (Windows safety)
if err := readerCloseFn(); err != nil {
klog.Warningf("Failed to close reader for %s: %v", file, err)
errorCh <- errors.Wrap(err, "failed to close reader")
return
}
readerCloseFn = nil
// Now replace the file with the buffered redacted content
err = input.ReplaceResult(bundlePath, file, &redactedBuf)
if err != nil {
errorCh <- errors.Wrap(err, "failed to create redacted result")
return

View File

@@ -188,11 +188,33 @@ func (r CollectorResult) ReplaceResult(bundlePath string, relativePath string, r
return nil
}
targetPath := filepath.Join(bundlePath, relativePath)
targetDir := filepath.Dir(targetPath)
// Ensure the target directory exists
if err := os.MkdirAll(targetDir, 0755); err != nil {
return errors.Wrap(err, "failed to create target directory")
}
// Create a temporary file in the same directory as the target file to prevent cross-device issues
tmpFile, err := os.CreateTemp("", "replace-")
tmpFile, err := os.CreateTemp(targetDir, "replace-*.tmp")
if err != nil {
return errors.Wrap(err, "failed to create temp file")
}
tmpFileName := tmpFile.Name()
// Ensure cleanup of temp file on error
cleanupNeeded := true
defer func() {
if tmpFile != nil {
// Best-effort close in defer; ignore close errors here
_ = tmpFile.Close()
}
if cleanupNeeded {
// Best-effort remove of temp file if we didn't successfully rename it
_ = os.Remove(tmpFileName)
}
}()
// Write data to the temporary file
_, err = io.Copy(tmpFile, reader)
@@ -201,13 +223,24 @@ func (r CollectorResult) ReplaceResult(bundlePath string, relativePath string, r
}
// Close the file to ensure all data is written
tmpFile.Close()
if err = tmpFile.Close(); err != nil {
return errors.Wrap(err, "failed to close tmp file")
}
tmpFile = nil // Prevent defer from closing again
// This rename should always be in /tmp, so no cross-partition copying will happen
err = os.Rename(tmpFile.Name(), filepath.Join(bundlePath, relativePath))
// On Windows, we need to remove the target file first before renaming
// On Unix, os.Rename will atomically replace the file
if err := os.Remove(targetPath); err != nil && !os.IsNotExist(err) {
return errors.Wrap(err, "failed to remove existing file")
}
// Rename temp file to target
err = os.Rename(tmpFileName, targetPath)
if err != nil {
return errors.Wrap(err, "failed to rename tmp file")
}
// If rename succeeded, no need to clean up the temp file path
cleanupNeeded = false
return nil
}
@@ -318,7 +351,8 @@ func (r CollectorResult) ArchiveBundle(bundlePath string, outputFilename string)
return errors.Wrap(err, "failed to create relative file name")
}
// Use the relative path of the file so as to retain directory hierachy
hdr.Name = nameInArchive
// Convert to forward slashes for tar archive (required for cross-platform compatibility)
hdr.Name = filepath.ToSlash(nameInArchive)
if fileMode.Type() == os.ModeSymlink {
linkTarget, err := os.Readlink(filename)
@@ -339,7 +373,8 @@ func (r CollectorResult) ArchiveBundle(bundlePath string, outputFilename string)
return errors.Wrap(err, "failed to create relative path of symlink target file")
}
hdr.Linkname = relLinkPath
// Convert to forward slashes for tar archive (required for cross-platform compatibility)
hdr.Linkname = filepath.ToSlash(relLinkPath)
}
err = tarWriter.WriteHeader(hdr)
@@ -347,7 +382,7 @@ func (r CollectorResult) ArchiveBundle(bundlePath string, outputFilename string)
return errors.Wrap(err, "failed to write tar header")
}
func() error {
err = func() error {
if fileMode.Type() == os.ModeSymlink {
// Don't copy the symlink, just write the header which
// will create a symlink in the tarball