Sanitize invalid XML characters in text content
All checks were successful
CI Pipeline / build (push) Successful in 49s
All checks were successful
CI Pipeline / build (push) Successful in 49s
Strip invalid XML 1.0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F) from text to prevent corrupted docx files that fail to open in LibreOffice. Fixes SAXParseException 'PCData Invalid Char value' errors.
This commit is contained in:
73
test/xml_sanitizer_test.rb
Normal file
73
test/xml_sanitizer_test.rb
Normal file
@@ -0,0 +1,73 @@
# frozen_string_literal: true

require "test_helper"

# Tests for Notare::XmlSanitizer.sanitize.
#
# The sanitizer is expected to strip the control characters that are invalid
# in XML 1.0 text (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F) while leaving tab, line
# feed, carriage return, ordinary text and non-ASCII Unicode untouched.
# Non-string arguments are expected to be passed through as-is.
class XmlSanitizerTest < Minitest::Test
  # --- characters that must be removed ------------------------------------

  def test_removes_null_character
    assert_equal "hello", Notare::XmlSanitizer.sanitize("hel\x00lo")
  end

  def test_removes_control_characters_0x01_to_0x08
    input = "a\x01b\x02c\x03d\x04e\x05f\x06g\x07h\x08i"
    assert_equal "abcdefghi", Notare::XmlSanitizer.sanitize(input)
  end

  # 0x0B (vertical tab) and 0x0C (form feed) sit between the allowed
  # whitespace characters 0x0A and 0x0D, so they get their own case.
  def test_removes_control_characters_0x0b_and_0x0c
    input = "hello\x0Bworld\x0Ctest"
    assert_equal "helloworldtest", Notare::XmlSanitizer.sanitize(input)
  end

  # Samples both ends of the 0x0E-0x1F range plus a few values in between.
  def test_removes_control_characters_0x0e_to_0x1f
    input = "a\x0Eb\x0Fc\x10d\x11e\x1Ff"
    assert_equal "abcdef", Notare::XmlSanitizer.sanitize(input)
  end

  # --- whitespace that must be preserved ----------------------------------

  def test_preserves_tab_character
    input = "hello\tworld"
    assert_equal "hello\tworld", Notare::XmlSanitizer.sanitize(input)
  end

  def test_preserves_newline_character
    input = "hello\nworld"
    assert_equal "hello\nworld", Notare::XmlSanitizer.sanitize(input)
  end

  def test_preserves_carriage_return_character
    input = "hello\rworld"
    assert_equal "hello\rworld", Notare::XmlSanitizer.sanitize(input)
  end

  def test_preserves_crlf
    input = "hello\r\nworld"
    assert_equal "hello\r\nworld", Notare::XmlSanitizer.sanitize(input)
  end

  # --- non-string input ---------------------------------------------------

  def test_returns_nil_unchanged
    assert_nil Notare::XmlSanitizer.sanitize(nil)
  end

  def test_returns_non_string_unchanged
    assert_equal 123, Notare::XmlSanitizer.sanitize(123)
    assert_equal :symbol, Notare::XmlSanitizer.sanitize(:symbol)
  end

  # --- ordinary content ---------------------------------------------------

  # Accented Latin, CJK and an emoji must all pass through untouched.
  def test_preserves_unicode_characters
    input = "café naïve 日本語 🎉"
    assert_equal "café naïve 日本語 🎉", Notare::XmlSanitizer.sanitize(input)
  end

  def test_preserves_regular_text
    input = "Hello, World! This is normal text."
    assert_equal input, Notare::XmlSanitizer.sanitize(input)
  end

  def test_handles_empty_string
    assert_equal "", Notare::XmlSanitizer.sanitize("")
  end

  # --- regression ---------------------------------------------------------

  def test_real_world_case_stx_character
    # The actual case from the failed.docx: 0x02 (STX) character
    input = "infrastruktur\x02bidrag"
    assert_equal "infrastrukturbidrag", Notare::XmlSanitizer.sanitize(input)
  end
end
|
||||
Reference in New Issue
Block a user