Files
troubleshoot/pkg/redact/multi_line.go
Benjamin Yang a9d2180dd6 102 redactor newline corruption clean (#1947)
* fix: prevent redactors from corrupting binary files (#102)

Redactors were adding newlines to files without them, corrupting binary
files during support bundle collection (51 bytes → 53 bytes).

Created LineReader to track original newline state and only restore
newlines when they were present in the original file.

- Added pkg/redact/line_reader.go
- Refactored single_line.go, multi_line.go, literal.go
- Added 48 tests, all passing
- Verified: binary files now preserved byte-for-byte

Fixes #102


* fix: handle empty lines correctly in MultiLineRedactor

- Check line1 == nil instead of len(line1) == 0 for empty file detection
- Fixes edge case where file containing only '\n' would be dropped
- Addresses bugbot finding about empty line handling


* fix: handle empty lines correctly in MultiLineRedactor

- Check line1 != nil instead of len(line1) > 0 in both locations
- Fixes edge case where empty trailing lines would be dropped
- Fix test isolation in literal_test.go (move ResetRedactionList to parent)
- Addresses bugbot findings about empty line handling

* fmt

* chore: update regression baselines from run 20107431959

* adding defense

* fix: propagate non-EOF errors in all early return paths

Ensure non-EOF errors (like buffer overflow) are properly propagated
to caller in both pre-loop early returns. Addresses bugbot finding.

* fix: use unique test names to prevent redaction list pollution

Use t.Name() instead of hardcoded 'test' to ensure each test
has unique redactor name, preventing parallel test interference

---------

Co-authored-by: hedge-sparrow <sparrow@spooky.academy>
2025-12-10 16:55:54 -06:00

265 lines
7.7 KiB
Go

package redact
import (
"bytes"
"io"
"regexp"
)
// lineState represents a line and whether it ended with a newline character.
// This is used by MultiLineRedactor to track newline state for line pairs.
type lineState struct {
content []byte
hadNewline bool
}
type MultiLineRedactor struct {
scan *regexp.Regexp
re1 *regexp.Regexp
re2 *regexp.Regexp
maskText string
filePath string
redactName string
isDefault bool
}
func NewMultiLineRedactor(re1 LineRedactor, re2 string, maskText, path, name string, isDefault bool) (*MultiLineRedactor, error) {
var scanCompiled *regexp.Regexp
compiled1, err := compileRegex(re1.regex)
if err != nil {
return nil, err
}
if re1.scan != "" {
scanCompiled, err = compileRegex(re1.scan)
if err != nil {
return nil, err
}
}
compiled2, err := compileRegex(re2)
if err != nil {
return nil, err
}
return &MultiLineRedactor{scan: scanCompiled, re1: compiled1, re2: compiled2, maskText: maskText, filePath: path, redactName: name, isDefault: isDefault}, nil
}
// Redact processes the input reader in pairs of lines, applying redaction patterns.
// Unlike the previous implementation using bufio.Reader with readLine(), this now
// uses LineReader to preserve the exact newline structure of the input file.
//
// The MultiLineRedactor works by:
// 1. Reading pairs of lines (line1, line2)
// 2. If line1 matches the selector pattern (re1), redact line2 using re2
// 3. Write both lines with their original newline structure preserved
//
// This ensures binary files and text files without trailing newlines are not corrupted.
func (r *MultiLineRedactor) Redact(input io.Reader, path string) io.Reader {
out, writer := io.Pipe()
go func() {
var err error
defer func() {
writer.CloseWithError(err)
}()
tokenizer := GetGlobalTokenizer()
lineReader := NewLineReader(input)
// Try to read first two lines
line1, nl1, line2, nl2, readErr := getNextTwoLines(lineReader, nil)
// Handle case where we can't read 2 lines (empty file or single line)
// Note: We check line1 == nil (not len(line1) == 0) because:
// - nil means truly empty file with no content
// - []byte{} (len==0) means an empty line that had a newline (e.g., "\n")
if readErr != nil && line1 == nil {
// Empty file - nothing to write
// Propagate non-EOF errors (EOF is expected for empty files)
if readErr != io.EOF {
err = readErr
}
return
}
if readErr != nil {
// Only 1 line available (or empty line with newline) - write it and exit
// FIX: This is the bug fix - only add newline if original had one
// Also handles empty lines (line1 == []byte{} with nl1 == true)
err = writeLine(writer, line1, nl1)
if err != nil {
return
}
// Propagate non-EOF errors (EOF is expected for single-line files)
if readErr != io.EOF {
err = readErr
}
return
}
// Process line pairs
flushLastLine := false
lineNum := 1
for readErr == nil {
lineNum++ // the first line that can be redacted is line 2
// Pre-filter: if scan is not nil, check if line1 matches scan by lowercasing it
if r.scan != nil {
lowerLine1 := bytes.ToLower(line1)
if !r.scan.Match(lowerLine1) {
// No match - write line1 and advance
err = writeLine(writer, line1, nl1)
if err != nil {
return
}
line1, nl1, line2, nl2, readErr = getNextTwoLines(lineReader, &lineState{line2, nl2})
flushLastLine = true
continue
}
}
// Check if line1 matches the selector pattern (re1)
if !r.re1.Match(line1) {
// No match - write line1 and advance
err = writeLine(writer, line1, nl1)
if err != nil {
return
}
line1, nl1, line2, nl2, readErr = getNextTwoLines(lineReader, &lineState{line2, nl2})
flushLastLine = true
continue
}
// line1 matched selector - redact line2
flushLastLine = false
var clean []byte
if tokenizer.IsEnabled() {
// Use tokenized replacement for line2 based on line1 context
context := r.redactName
clean = getTokenizedReplacementPatternWithPath(r.re2, line2, context, r.filePath)
} else {
// Use original masking behavior
substStr := []byte(getReplacementPattern(r.re2, r.maskText))
clean = r.re2.ReplaceAll(line2, substStr)
}
// Write line1 (selector line) and line2 (redacted line)
err = writeLine(writer, line1, nl1)
if err != nil {
return
}
err = writeLine(writer, clean, nl2)
if err != nil {
return
}
// Track redaction if content changed
if !bytes.Equal(clean, line2) {
addRedaction(Redaction{
RedactorName: r.redactName,
CharactersRemoved: len(line2) - len(clean),
Line: lineNum,
File: r.filePath,
IsDefaultRedactor: r.isDefault,
})
}
// Get next pair
line1, nl1, line2, nl2, readErr = getNextTwoLines(lineReader, nil)
}
// After loop exits (readErr != nil), check if we have an unwritten line1
// This happens in two cases:
// 1. flushLastLine=true: line1 was advanced but not written (scan/re1 didn't match)
// 2. line1 != nil: we read line1 but couldn't get line2 (unpaired line at end)
// Note: We check line1 != nil (not len(line1) > 0) to handle empty lines ([]byte{})
if flushLastLine || line1 != nil {
err = writeLine(writer, line1, nl1)
if err != nil {
return
}
}
// Propagate non-EOF read errors to the caller
// EOF is expected (end of file) and not an error condition
// Note: readErr is always non-nil here (loop exited), but we only propagate non-EOF errors
if readErr != io.EOF {
err = readErr
}
}()
return out
}
// getNextTwoLines reads the next pair of lines from the LineReader.
// It returns the content and newline state for both lines.
//
// If curLine2 is provided, it's used as line1 (optimization for advancing through file).
// Otherwise, both lines are read fresh from the reader.
//
// Returns:
// - line1, hadNewline1: First line content and newline state
// - line2, hadNewline2: Second line content and newline state
// - err: Error only if we couldn't read line1, or if line2 read failed with non-EOF error
//
// Note: If line2 returns (content, false, io.EOF), we treat this as SUCCESS because
// we got the content. The EOF just means it didn't have a trailing newline.
func getNextTwoLines(lr *LineReader, curLine2 *lineState) (
line1 []byte, hadNewline1 bool,
line2 []byte, hadNewline2 bool,
err error,
) {
if curLine2 == nil {
// Read both lines fresh
line1, hadNewline1, err = lr.ReadLine()
if err != nil {
return
}
line2, hadNewline2, err = lr.ReadLine()
// If we got line2 content but hit EOF, that's OK - it just means no trailing newline
if err == io.EOF && len(line2) > 0 {
err = nil // Clear the error - we successfully read both lines
}
return
}
// Use cached line2 as new line1 (optimization)
line1 = curLine2.content
hadNewline1 = curLine2.hadNewline
// Read new line2
line2, hadNewline2, err = lr.ReadLine()
// If we got line2 content but hit EOF, that's OK - it just means no trailing newline
if err == io.EOF && len(line2) > 0 {
err = nil // Clear the error - we successfully read both lines
}
return
}
// writeLine writes a line to the writer, optionally adding a newline if hadNewline is true.
// This helper reduces code duplication in the Redact function.
func writeLine(w io.Writer, line []byte, hadNewline bool) error {
if err := writeBytes(w, line); err != nil {
return err
}
if hadNewline {
if err := writeBytes(w, NEW_LINE); err != nil {
return err
}
}
return nil
}
// writeBytes writes all byte slices to the writer
// in the order they are passed in the variadic argument
func writeBytes(w io.Writer, bs ...[]byte) error {
for _, b := range bs {
_, err := w.Write(b)
if err != nil {
return err
}
}
return nil
}