Sanitize invalid XML characters in text content
All checks were successful
CI Pipeline / build (push) Successful in 49s

Strip invalid XML 1.0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F)
from text to prevent corrupted docx files that fail to open in LibreOffice.

Fixes SAXParseException 'PCData Invalid Char value' errors.
This commit is contained in:
2026-01-22 09:10:33 +01:00
parent 8b4f538cbb
commit 64c8679044
6 changed files with 108 additions and 2 deletions

View File

@@ -3,6 +3,7 @@
require "nokogiri" require "nokogiri"
require_relative "notare/version" require_relative "notare/version"
require_relative "notare/xml_sanitizer"
require_relative "notare/nodes/base" require_relative "notare/nodes/base"
require_relative "notare/nodes/break" require_relative "notare/nodes/break"
require_relative "notare/nodes/hyperlink" require_relative "notare/nodes/hyperlink"

View File

@@ -8,7 +8,7 @@ module Notare
def initialize(text, bold: false, italic: false, underline: false, def initialize(text, bold: false, italic: false, underline: false,
strike: false, highlight: nil, color: nil, style: nil) strike: false, highlight: nil, color: nil, style: nil)
super() super()
@text = text @text = XmlSanitizer.sanitize(text)
@bold = bold @bold = bold
@italic = italic @italic = italic
@underline = underline @underline = underline

View File

@@ -1,5 +1,5 @@
# frozen_string_literal: true # frozen_string_literal: true
module Notare module Notare
VERSION = "0.0.5" VERSION = "0.0.6"
end end

View File

@@ -0,0 +1,15 @@
# frozen_string_literal: true
module Notare
  # Strips characters that the XML 1.0 specification does not allow in a
  # document, so that text containing them cannot produce malformed XML.
  module XmlSanitizer
    # C0 control characters forbidden by XML 1.0: 0x00-0x08, 0x0B-0x0C, 0x0E-0x1F.
    # Valid whitespace is preserved: 0x09 (tab), 0x0A (LF), 0x0D (CR).
    INVALID_XML_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F]/

    # U+FFFE and U+FFFF are also excluded by the XML 1.0 Char production
    # (the allowed range ends at U+FFFD). Kept as a separate pattern because
    # the \u escapes make this regexp UTF-8 only, whereas INVALID_XML_CHARS
    # stays usable on any ASCII-compatible string.
    INVALID_XML_NONCHARACTERS = /[\uFFFE\uFFFF]/

    # Returns a copy of +text+ with characters that are invalid in XML 1.0
    # removed.
    #
    # @param text [Object] the value to sanitize
    # @return [String, Object] a new sanitized String when +text+ is a String;
    #   otherwise +text+ itself, unchanged (nil, numbers, symbols, ...)
    def self.sanitize(text)
      return text unless text.is_a?(String)

      cleaned = text.gsub(INVALID_XML_CHARS, "")
      # The noncharacter pattern can only be matched against UTF-8 text;
      # applying it to other encodings would raise a compatibility error.
      return cleaned unless cleaned.encoding == Encoding::UTF_8

      cleaned.gsub(INVALID_XML_NONCHARACTERS, "")
    end
  end
end

View File

@@ -111,4 +111,21 @@ class ParagraphTest < Minitest::Test
# Newlines should be preserved in the text # Newlines should be preserved in the text
assert_includes xml, "Line 1\nLine 2\nLine 3" assert_includes xml, "Line 1\nLine 2\nLine 3"
end end
# Writes paragraphs containing control characters that XML 1.0 forbids and
# checks that the generated XML has them removed and parses in strict mode.
def test_invalid_xml_characters_are_stripped
  # Each pair is [raw paragraph text, text expected in the output XML].
  cases = [
    ["infrastruktur\x02bidrag", "infrastrukturbidrag"],
    ["hello\x00world", "helloworld"],
    ["test\x01\x03\x04value", "testvalue"]
  ]

  xml = create_doc_and_read_xml do |doc|
    cases.each { |raw, _clean| doc.p raw }
  end

  # Invalid characters should be stripped
  cases.each { |_raw, clean| assert_includes xml, clean }

  # Verify the XML is valid by parsing it (will raise if invalid)
  parsed = Nokogiri::XML(xml, &:strict)
  assert parsed.errors.empty?, "XML should be valid: #{parsed.errors}"
end
end end

View File

@@ -0,0 +1,73 @@
# frozen_string_literal: true
require "test_helper"
# Unit tests for Notare::XmlSanitizer.sanitize: which characters are removed,
# which are kept, and how non-String values are passed through.
class XmlSanitizerTest < Minitest::Test
  def test_removes_null_character
    assert_sanitized "hello", "hel\x00lo"
  end

  def test_removes_control_characters_0x01_to_0x08
    assert_sanitized "abcdefghi", "a\x01b\x02c\x03d\x04e\x05f\x06g\x07h\x08i"
  end

  def test_removes_control_characters_0x0b_and_0x0c
    assert_sanitized "helloworldtest", "hello\x0Bworld\x0Ctest"
  end

  def test_removes_control_characters_0x0e_to_0x1f
    assert_sanitized "abcdef", "a\x0Eb\x0Fc\x10d\x11e\x1Ff"
  end

  def test_preserves_tab_character
    assert_sanitized "hello\tworld", "hello\tworld"
  end

  def test_preserves_newline_character
    assert_sanitized "hello\nworld", "hello\nworld"
  end

  def test_preserves_carriage_return_character
    assert_sanitized "hello\rworld", "hello\rworld"
  end

  def test_preserves_crlf
    assert_sanitized "hello\r\nworld", "hello\r\nworld"
  end

  def test_returns_nil_unchanged
    assert_nil Notare::XmlSanitizer.sanitize(nil)
  end

  def test_returns_non_string_unchanged
    assert_sanitized 123, 123
    assert_sanitized :symbol, :symbol
  end

  def test_preserves_unicode_characters
    assert_sanitized "café naïve 日本語 🎉", "café naïve 日本語 🎉"
  end

  def test_preserves_regular_text
    plain = "Hello, World! This is normal text."
    assert_sanitized plain, plain
  end

  def test_handles_empty_string
    assert_sanitized "", ""
  end

  def test_real_world_case_stx_character
    # The actual case from the failed.docx: 0x02 (STX) character
    assert_sanitized "infrastrukturbidrag", "infrastruktur\x02bidrag"
  end

  private

  # Asserts that sanitizing +input+ yields +expected+.
  def assert_sanitized(expected, input)
    assert_equal expected, Notare::XmlSanitizer.sanitize(input)
  end
end