Files
Notare/test/xml_sanitizer_test.rb
mathias234 64c8679044
All checks were successful
CI Pipeline / build (push) Successful in 49s
Sanitize invalid XML characters in text content
Strip invalid XML 1.0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F)
from text to prevent corrupted docx files that fail to open in LibreOffice.

Fixes SAXParseException 'PCData Invalid Char value' errors.
2026-01-22 09:10:33 +01:00

74 lines
2.1 KiB
Ruby

# frozen_string_literal: true
require "test_helper"
# Unit tests for Notare::XmlSanitizer.sanitize.
#
# The expectations pinned here are:
# * control characters 0x00-0x08, 0x0B-0x0C and (sampled) 0x0E-0x1F are dropped,
# * tab, line feed and carriage return come back untouched,
# * nil and other non-String arguments are handed back as-is,
# * ordinary text, including multi-byte Unicode, is returned unchanged.
class XmlSanitizerTest < Minitest::Test
  # --- characters that must be stripped -------------------------------------

  def test_removes_null_character
    assert_equal "hello", sanitize("hel\x00lo")
  end

  def test_removes_control_characters_0x01_to_0x08
    dirty = "a\x01b\x02c\x03d\x04e\x05f\x06g\x07h\x08i"

    assert_equal "abcdefghi", sanitize(dirty)
  end

  def test_removes_control_characters_0x0b_and_0x0c
    dirty = "hello\x0Bworld\x0Ctest"

    assert_equal "helloworldtest", sanitize(dirty)
  end

  def test_removes_control_characters_0x0e_to_0x1f
    # Samples both ends of the range plus a few values in between.
    dirty = "a\x0Eb\x0Fc\x10d\x11e\x1Ff"

    assert_equal "abcdef", sanitize(dirty)
  end

  # --- whitespace that must survive -----------------------------------------

  def test_preserves_tab_character
    assert_equal "hello\tworld", sanitize("hello\tworld")
  end

  def test_preserves_newline_character
    assert_equal "hello\nworld", sanitize("hello\nworld")
  end

  def test_preserves_carriage_return_character
    assert_equal "hello\rworld", sanitize("hello\rworld")
  end

  def test_preserves_crlf
    assert_equal "hello\r\nworld", sanitize("hello\r\nworld")
  end

  # --- non-String arguments --------------------------------------------------

  def test_returns_nil_unchanged
    assert_nil sanitize(nil)
  end

  def test_returns_non_string_unchanged
    [123, :symbol].each do |value|
      assert_equal value, sanitize(value)
    end
  end

  # --- text that needs no cleaning -------------------------------------------

  def test_preserves_unicode_characters
    assert_equal "café naïve 日本語 🎉", sanitize("café naïve 日本語 🎉")
  end

  def test_preserves_regular_text
    plain = "Hello, World! This is normal text."

    assert_equal plain, sanitize(plain)
  end

  def test_handles_empty_string
    assert_equal "", sanitize("")
  end

  # --- regression -------------------------------------------------------------

  def test_real_world_case_stx_character
    # Regression: the failed.docx case contained a 0x02 (STX) character
    # in the middle of a word.
    dirty = "infrastruktur\x02bidrag"

    assert_equal "infrastrukturbidrag", sanitize(dirty)
  end

  private

  # Thin wrapper so each test reads as `sanitize(value)`; forwards the
  # argument untouched to the module under test and returns its result.
  def sanitize(value)
    Notare::XmlSanitizer.sanitize(value)
  end
end