From 64c8679044dd91ae33cf6c1e52b6d211358cdf1c Mon Sep 17 00:00:00 2001
From: mathias234
Date: Thu, 22 Jan 2026 09:10:33 +0100
Subject: [PATCH] Sanitize invalid XML characters in text content

Strip invalid XML 1.0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F)
from text to prevent corrupted docx files that fail to open in LibreOffice.
Fixes SAXParseException 'PCData Invalid Char value' errors.
---
 lib/notare.rb               |  1 +
 lib/notare/nodes/run.rb     |  2 +-
 lib/notare/version.rb       |  2 +-
 lib/notare/xml_sanitizer.rb | 15 ++++++++
 test/paragraph_test.rb      | 17 +++++++++
 test/xml_sanitizer_test.rb  | 73 +++++++++++++++++++++++++++++++++++++
 6 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 lib/notare/xml_sanitizer.rb
 create mode 100644 test/xml_sanitizer_test.rb

diff --git a/lib/notare.rb b/lib/notare.rb
index 99f6bfe..5702ea3 100644
--- a/lib/notare.rb
+++ b/lib/notare.rb
@@ -3,6 +3,7 @@
 require "nokogiri"
 
 require_relative "notare/version"
+require_relative "notare/xml_sanitizer"
 require_relative "notare/nodes/base"
 require_relative "notare/nodes/break"
 require_relative "notare/nodes/hyperlink"
diff --git a/lib/notare/nodes/run.rb b/lib/notare/nodes/run.rb
index 5cc9e6e..ef142ee 100644
--- a/lib/notare/nodes/run.rb
+++ b/lib/notare/nodes/run.rb
@@ -8,7 +8,7 @@ module Notare
       def initialize(text, bold: false, italic: false, underline: false,
                      strike: false, highlight: nil, color: nil, style: nil)
         super()
-        @text = text
+        @text = XmlSanitizer.sanitize(text)
         @bold = bold
         @italic = italic
         @underline = underline
diff --git a/lib/notare/version.rb b/lib/notare/version.rb
index fb0d1c8..d5c8336 100644
--- a/lib/notare/version.rb
+++ b/lib/notare/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Notare
-  VERSION = "0.0.5"
+  VERSION = "0.0.6"
 end
diff --git a/lib/notare/xml_sanitizer.rb b/lib/notare/xml_sanitizer.rb
new file mode 100644
index 0000000..d3867f0
--- /dev/null
+++ b/lib/notare/xml_sanitizer.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module Notare
+  module XmlSanitizer
+    # Invalid XML 1.0 characters: 0x00, 0x01-0x08, 0x0B-0x0C, 0x0E-0x1F
+    # Valid whitespace preserved: 0x09 (tab), 0x0A (LF), 0x0D (CR)
+    INVALID_XML_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F]/
+
+    def self.sanitize(text)
+      return text unless text.is_a?(String)
+
+      text.gsub(INVALID_XML_CHARS, "")
+    end
+  end
+end
diff --git a/test/paragraph_test.rb b/test/paragraph_test.rb
index 5615df2..a265249 100644
--- a/test/paragraph_test.rb
+++ b/test/paragraph_test.rb
@@ -111,4 +111,21 @@ class ParagraphTest < Minitest::Test
     # Newlines should be preserved in the text
     assert_includes xml, "Line 1\nLine 2\nLine 3"
   end
+
+  def test_invalid_xml_characters_are_stripped
+    xml = create_doc_and_read_xml do |doc|
+      doc.p "infrastruktur\x02bidrag"
+      doc.p "hello\x00world"
+      doc.p "test\x01\x03\x04value"
+    end
+
+    # Invalid characters should be stripped
+    assert_includes xml, "infrastrukturbidrag"
+    assert_includes xml, "helloworld"
+    assert_includes xml, "testvalue"
+
+    # Verify the XML is valid by parsing it (will raise if invalid)
+    doc = Nokogiri::XML(xml, &:strict)
+    assert doc.errors.empty?, "XML should be valid: #{doc.errors}"
+  end
 end
diff --git a/test/xml_sanitizer_test.rb b/test/xml_sanitizer_test.rb
new file mode 100644
index 0000000..20865d5
--- /dev/null
+++ b/test/xml_sanitizer_test.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+class XmlSanitizerTest < Minitest::Test
+  def test_removes_null_character
+    assert_equal "hello", Notare::XmlSanitizer.sanitize("hel\x00lo")
+  end
+
+  def test_removes_control_characters_0x01_to_0x08
+    input = "a\x01b\x02c\x03d\x04e\x05f\x06g\x07h\x08i"
+    assert_equal "abcdefghi", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_removes_control_characters_0x0b_and_0x0c
+    input = "hello\x0Bworld\x0Ctest"
+    assert_equal "helloworldtest", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_removes_control_characters_0x0e_to_0x1f
+    input = "a\x0Eb\x0Fc\x10d\x11e\x1Ff"
+    assert_equal "abcdef", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_preserves_tab_character
+    input = "hello\tworld"
+    assert_equal "hello\tworld", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_preserves_newline_character
+    input = "hello\nworld"
+    assert_equal "hello\nworld", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_preserves_carriage_return_character
+    input = "hello\rworld"
+    assert_equal "hello\rworld", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_preserves_crlf
+    input = "hello\r\nworld"
+    assert_equal "hello\r\nworld", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_returns_nil_unchanged
+    assert_nil Notare::XmlSanitizer.sanitize(nil)
+  end
+
+  def test_returns_non_string_unchanged
+    assert_equal 123, Notare::XmlSanitizer.sanitize(123)
+    assert_equal :symbol, Notare::XmlSanitizer.sanitize(:symbol)
+  end
+
+  def test_preserves_unicode_characters
+    input = "café naïve 日本語 🎉"
+    assert_equal "café naïve 日本語 🎉", Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_preserves_regular_text
+    input = "Hello, World! This is normal text."
+    assert_equal input, Notare::XmlSanitizer.sanitize(input)
+  end
+
+  def test_handles_empty_string
+    assert_equal "", Notare::XmlSanitizer.sanitize("")
+  end
+
+  def test_real_world_case_stx_character
+    # The actual case from the failed.docx: 0x02 (STX) character
+    input = "infrastruktur\x02bidrag"
+    assert_equal "infrastrukturbidrag", Notare::XmlSanitizer.sanitize(input)
+  end
+end