diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..b18fd293 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: 'github-actions' + directory: '/' + schedule: + interval: 'weekly' diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..52349b44 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,29 @@ +name: Benchmark + +on: + - push + - pull_request + +jobs: + benchmark: + name: "Benchmark: Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}" + strategy: + fail-fast: false + matrix: + ruby-version: + - '3.3' + runs-on: + - ubuntu-latest + runs-on: ${{ matrix.runs-on }} + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby-version }} + - name: Install dependencies + run: | + bundle install + gem install rexml -v 3.2.6 + - name: Benchmark + run: | + rake benchmark diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..20ff87e7 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,30 @@ +name: Release +on: + push: + tags: + - "*" +jobs: + github: + name: GitHub + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + - name: Extract release note + run: | + ruby \ + -e 'print("## REXML "); \ + puts(ARGF.read.split(/^## /)[1]. \ + gsub(/ {.+?}/, ""). 
\ + gsub(/\[(.+?)\]\[.+?\]/) {$1})' \ + NEWS.md > release-note.md + - name: Upload to release + run: | + title=$(head -n1 release-note.md | sed -e 's/^## //') + tail -n +2 release-note.md > release-note-without-version.md + gh release create ${GITHUB_REF_NAME} \ + --discussion-category Announcements \ + --notes-file release-note-without-version.md \ + --title "${title}" + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65a3bffd..0bd43457 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,7 +3,14 @@ on: - push - pull_request jobs: + ruby-versions-inplace: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.5 + inplace: + needs: ruby-versions-inplace name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -13,16 +20,14 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "2.5" - - "2.6" - - "2.7" - - jruby + ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} + exclude: + - {runs-on: macos-latest, ruby-version: 2.5} # include: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -30,7 +35,26 @@ jobs: - name: Test run: bundle exec rake test + frozen-string-literal: + name: frozen-string-literal + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler-cache: true + - name: Test + run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + + ruby-versions-gems: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.6 # REXML is a default gem since Ruby 2.6 + gem: + needs: ruby-versions-gems name: "Gem: ${{ matrix.ruby-version }} on ${{ 
matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -40,17 +64,26 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "3.0" - - head + ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Install as gem run: | rake install + - name: Install test dependencies on non-Windows + if: matrix.runs-on != 'windows-latest' + run: | + for gem in $(ruby -e 'puts ARGF.read[/^group :test do(.*)^end/m, 1].scan(/"(.+?)"/)' Gemfile); do + gem install ${gem} + done + - name: Install test dependencies on Windows + if: matrix.runs-on == 'windows-latest' + run: | + gem install test-unit + gem install test-unit-ruby-core - name: Test run: | ruby -run -e mkdir -- tmp @@ -62,17 +95,17 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: - ruby-version: 2.7 + ruby-version: ruby - name: Install dependencies run: | bundle install - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 if: | github.event_name == 'push' with: diff --git a/Gemfile b/Gemfile index 54da2c0c..67f21dfb 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in rexml.gemspec gemspec + +group :development do + gem "bundler" + gem "rake" +end + +group :benchmark do + gem "benchmark_driver" +end + +group :test do + gem "test-unit" + gem "test-unit-ruby-core" +end diff --git a/NEWS.md b/NEWS.md index 84bbde2d..72318b7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,15 +1,349 @@ # News +## 3.3.3 - 2024-08-01 {#version-3-3-3} + +### Improvements + + * Added support for detecting invalid XML that has unsupported + content before root element + * GH-184 + * Patch by 
NAITOH Jun. + + * Added support for `REXML::Security.entity_expansion_limit=` and + `REXML::Security.entity_expansion_text_limit=` in SAX2 and pull + parsers + * GH-187 + * Patch by NAITOH Jun. + + * Added more tests for invalid XMLs. + * GH-183 + * Patch by Watson. + + * Added more performance tests. + * Patch by Watson. + + * Improved parse performance. + * GH-186 + * Patch by tomoya ishida. + +### Thanks + + * NAITOH Jun + + * Watson + + * tomoya ishida + +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * GH-175 + * GH-176 + * GH-177 + * Patch by Watson. + + * Added support for raising a parse exception when an XML has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in the wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed a needless space after the XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped emitting `:text` events after the root element. + * GH-167 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that SAX2 parser doesn't expand predefined entities for + `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `<!-->` comments. + * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changelog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. 
+ * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. + +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + +## 3.3.0 - 2024-06-11 {#version-3-3-0} + +### Improvements + + * Added support for strscan 0.7.0 installed with Ruby 2.6. + * GH-142 + * Reported by Fernando Trigoso. + +### Thanks + + * Fernando Trigoso + +## 3.2.9 - 2024-06-09 {#version-3-2-9} + +### Improvements + + * Added support for old strscan. + * GH-132 + * Reported by Adam. + + * Improved attribute value parse performance. + * GH-135 + * Patch by NAITOH Jun. + + * Improved `REXML::Node#each_recursive` performance. + * GH-134 + * GH-139 + * Patch by Hiroya Fujinami. + + * Improved text parse performance. + * Reported by mprogrammer. + +### Thanks + + * Adam + * NAITOH Jun + * Hiroya Fujinami + * mprogrammer + +## 3.2.8 - 2024-05-16 {#version-3-2-8} + +### Fixes + + * Suppressed a warning + +## 3.2.7 - 2024-05-16 {#version-3-2-7} + +### Improvements + + * Improve parse performance by using `StringScanner`. + + * GH-106 + * GH-107 + * GH-108 + * GH-109 + * GH-112 + * GH-113 + * GH-114 + * GH-115 + * GH-116 + * GH-117 + * GH-118 + * GH-119 + * GH-121 + + * Patch by NAITOH Jun. + + * Improved parse performance when an attribute has many `<`s. + + * GH-126 + +### Fixes + + * XPath: Fixed a bug of `normalize_space(array)`. + + * GH-110 + * GH-111 + + * Patch by flatisland. + + * XPath: Fixed a bug that wrong position is used with nested path. + + * GH-110 + * GH-122 + + * Reported by jcavalieri. + * Patch by NAITOH Jun. + + * Fixed a bug that an exception message can't be generated for + invalid encoding XML. + + * GH-29 + * GH-123 + + * Reported by DuKewu. + * Patch by NAITOH Jun. 
+ +### Thanks + + * NAITOH Jun + * flatisland + * jcavalieri + * DuKewu + +## 3.2.6 - 2023-07-27 {#version-3-2-6} + +### Improvements + + * Required Ruby 2.5 or later explicitly. + [GH-69][gh-69] + [Patch by Ivo Anjo] + + * Added documentation for maintenance cycle. + [GH-71][gh-71] + [Patch by Ivo Anjo] + + * Added tutorial. + [GH-77][gh-77] + [GH-78][gh-78] + [Patch by Burdette Lamar] + + * Improved performance and memory usage. + [GH-94][gh-94] + [Patch by fatkodima] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for + function arguments. + [GH-95][gh-95] + [Reported by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for string + literal that contains double-quote. + [GH-96][gh-96] + [Patch by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added missing `/` to + `:descendant_or_self/:self/:parent`. + [GH-97][gh-97] + [Reported by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for more patterns. + [GH-97][gh-97] + [Reported by pulver] + +### Fixes + + * Fixed a typo in NEWS. + [GH-72][gh-72] + [Patch by Spencer Goodman] + + * Fixed a typo in NEWS. + [GH-75][gh-75] + [Patch by Andrew Bromwich] + + * Fixed documents. + [GH-87][gh-87] + [Patch by Alexander Ilyin] + + * Fixed a bug that `Attribute` converts `'` and `&apos;` even when + `attribute_quote: :quote` is used. + [GH-92][gh-92] + [Reported by Edouard Brière] + + * Fixed links in tutorial. 
+ [GH-99][gh-99] + [Patch by gemmaro] + + +### Thanks + + * Ivo Anjo + + * Spencer Goodman + + * Andrew Bromwich + + * Burdette Lamar + + * Alexander Ilyin + + * Edouard Brière + + * fatkodima + + * pulver + + * gemmaro + +[gh-69]:https://github.com/ruby/rexml/issues/69 +[gh-71]:https://github.com/ruby/rexml/issues/71 +[gh-72]:https://github.com/ruby/rexml/issues/72 +[gh-75]:https://github.com/ruby/rexml/issues/75 +[gh-77]:https://github.com/ruby/rexml/issues/77 +[gh-87]:https://github.com/ruby/rexml/issues/87 +[gh-92]:https://github.com/ruby/rexml/issues/92 +[gh-94]:https://github.com/ruby/rexml/issues/94 +[gh-95]:https://github.com/ruby/rexml/issues/95 +[gh-96]:https://github.com/ruby/rexml/issues/96 +[gh-97]:https://github.com/ruby/rexml/issues/97 +[gh-98]:https://github.com/ruby/rexml/issues/98 +[gh-99]:https://github.com/ruby/rexml/issues/99 + ## 3.2.5 - 2021-04-05 {#version-3-2-5} ### Improvements * Add more validations to XPath parser. - * `require "rexml/docuemnt"` by default. + * `require "rexml/document"` by default. [GitHub#36][Patch by Koichi ITO] - * Don't add `#dcloe` method to core classes globally. + * Don't add `#dclone` method to core classes globally. [GitHub#37][Patch by Akira Matsuda] * Add more documentations. diff --git a/README.md b/README.md index 27da0e49..e8ab5082 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ REXML supports both tree and stream document parsing. Stream parsing is faster ( ## API -See the {API documentation}[https://ruby.github.io/rexml/] +See the [API documentation](https://ruby.github.io/rexml/). ## Usage @@ -33,6 +33,15 @@ doc = Document.new string So parsing a string is just as easy as parsing a file. +## Support + +REXML support follows the same maintenance cycle as Ruby releases, as shown on <https://www.ruby-lang.org/en/downloads/branches/>. 
+ +If you are running on an end-of-life Ruby, do not expect modern REXML releases to be compatible with it; in fact, it's recommended that you DO NOT use this gem, and instead use the REXML version that came bundled with your end-of-life Ruby version. + +The `required_ruby_version` on the gemspec is kept updated on a [best-effort basis](https://github.com/ruby/rexml/pull/70) by the community. +Up to version 3.2.5, this information was not set. That version [is known broken with at least Ruby < 2.3](https://github.com/ruby/rexml/issues/69). + ## Development After checking out the repo, run `rake test` to run the tests. diff --git a/Rakefile b/Rakefile index 7143e754..76a56296 100644 --- a/Rakefile +++ b/Rakefile @@ -28,3 +28,42 @@ RDoc::Task.new do |rdoc| end load "#{__dir__}/tasks/tocs.rake" + +benchmark_tasks = [] +namespace :benchmark do + Dir.glob("benchmark/*.yaml").sort.each do |yaml| + name = File.basename(yaml, ".*") + env = { + "RUBYLIB" => nil, + "BUNDLER_ORIG_RUBYLIB" => nil, + } + command_line = [ + RbConfig.ruby, "-v", "-S", "benchmark-driver", File.expand_path(yaml), + ] + + desc "Run #{name} benchmark" + task name do + puts("```") + sh(env, *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}" + + case name + when /\Aparse/ + namespace name do + desc "Run #{name} benchmark: small" + task :small do + puts("```") + sh(env.merge("N_ELEMENTS" => "500", "N_ATTRIBUTES" => "1"), + *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}:small" + end + end + end +end + +desc "Run all benchmarks" +task :benchmark => benchmark_tasks diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + 
rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/benchmark/each_recursive.yaml b/benchmark/each_recursive.yaml new file mode 100644 index 00000000..c745f8ce --- /dev/null +++ b/benchmark/each_recursive.yaml @@ -0,0 +1,40 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = +"" + 100.times do + x_node_source = "" + 100.times do + x_node_source = "#{x_node_source}" + end + xml_source << x_node_source + end + xml_source << "" + + document = REXML::Document.new(xml_source) + +benchmark: + each_recursive: document.each_recursive { |_| } diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml new file mode 100644 index 00000000..3f6af739 --- /dev/null +++ b/benchmark/gt.yaml @@ -0,0 +1,34 @@ +loop_count: 10 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require "rexml" + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require "rexml" + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + 
$LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + RubyVM::YJIT.enable + +prelude: | + require "rexml/document" + + n = 10000 + gts = ">" * n + in_attribute = "" + in_text = "#{gts}" + +benchmark: + "attribute": REXML::Document.new(in_attribute) + "text": REXML::Document.new(in_text) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml new file mode 100644 index 00000000..f2c7d336 --- /dev/null +++ b/benchmark/parse.yaml @@ -0,0 +1,57 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + n_elements = Integer(ENV.fetch("N_ELEMENTS", "5000"), 10) + n_attributes = Integer(ENV.fetch("N_ATTRIBUTES", "2"), 10) + + def build_xml(n_elements, n_attributes) + xml = '' + n_elements.times do |i| + xml << '' + end + xml << '' + end + xml = build_xml(n_elements, n_attributes) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? 
+ parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/doc/rexml/tasks/rdoc/element.rdoc b/doc/rexml/tasks/rdoc/element.rdoc index f229275f..4b3609b0 100644 --- a/doc/rexml/tasks/rdoc/element.rdoc +++ b/doc/rexml/tasks/rdoc/element.rdoc @@ -369,7 +369,7 @@ to retrieve the first text node in a specified element: Use method {Element#has_text?}[../../../../REXML/Element.html#method-i-has_text-3F] -to determine whethe the element has text: +to determine whether the element has text: e = REXML::Element.new('foo') e.has_text? # => false @@ -486,7 +486,7 @@ to remove a specific namespace from the element: Use method {Element#namespace}[../../../../REXML/Element.html#method-i-namespace] -to retrieve a speficic namespace URI for the element: +to retrieve a specific namespace URI for the element: xml_string = <<-EOT diff --git a/doc/rexml/tutorial.rdoc b/doc/rexml/tutorial.rdoc new file mode 100644 index 00000000..c85a70d0 --- /dev/null +++ b/doc/rexml/tutorial.rdoc @@ -0,0 +1,1358 @@ += \REXML Tutorial + +== Why \REXML? + +- Ruby's \REXML library is part of the Ruby distribution, + so using it requires no gem installations. +- \REXML is fully maintained. +- \REXML is mature, having been in use for long years. + +== To Include, or Not to Include? + +REXML is a module. +To use it, you must require it: + + require 'rexml' # => true + +If you do not also include it, you must fully qualify references to REXML: + + REXML::Document # => REXML::Document + +If you also include the module, you may optionally omit REXML::: + + include REXML + Document # => REXML::Document + REXML::Document # => REXML::Document + +== Preliminaries + +All examples here assume that the following code has been executed: + + require 'rexml' + include REXML + +The source XML for many examples here is from file +{books.xml}[https://www.w3schools.com/xml/books.xml] at w3schools.com. 
+You may find it convenient to open that page in a new tab +(Ctrl-click in some browsers). + +Note that your browser may display the XML with modified whitespace +and without the XML declaration, which in this case is: + + + +For convenience, we capture the XML into a string variable: + + require 'open-uri' + source_string = URI.open('https://www.w3schools.com/xml/books.xml').read + +And into a file: + + File.write('source_file.xml', source_string) + +Throughout these examples, variable +doc+ will hold only the document +derived from these sources: + + doc = Document.new(source_string) + +== Parsing \XML \Source + +=== Parsing a Document + +Use method REXML::Document::new to parse XML source. + +The source may be a string: + + doc = Document.new(source_string) + +Or an \IO stream: + + doc = File.open('source_file.xml', 'r') do |io| + Document.new(io) + end + +Method URI.open returns a StringIO object, +so the source can be from a web page: + + require 'open-uri' + io = URI.open("https://www.w3schools.com/xml/books.xml") + io.class # => StringIO + doc = Document.new(io) + +For any of these sources, the returned object is an REXML::Document: + + doc # => ... + doc.class # => REXML::Document + +Note: 'UNDEFINED' is the "name" displayed for a document, +even though doc.name returns an empty string "". + +A parsed document may produce \REXML objects of many classes, +but the two that are likely to be of greatest interest are +REXML::Document and REXML::Element. +These two classes are covered in great detail in this tutorial. + +=== Context (Parsing Options) + +The context for parsing a document is a hash that influences +the way the XML is read and stored. + +The context entries are: + +- +:respect_whitespace+: controls treatment of whitespace. +- +:compress_whitespace+: determines whether whitespace is compressed. +- +:ignore_whitespace_nodes+: determines whether whitespace-only nodes are to be ignored. +- +:raw+: controls treatment of special characters and entities. 
+ +See {Element Context}[../context_rdoc.html]. + +== Exploring the Document + +An REXML::Document object represents an XML document. + +The object inherits from its ancestor classes: + +- REXML::Child (includes module REXML::Node) + - REXML::Parent (includes module {Enumerable}[rdoc-ref:Enumerable]). + - REXML::Element (includes module REXML::Namespace). + - REXML::Document + +This section covers only those properties and methods that are unique to a document +(that is, not inherited or included). + +=== Document Properties + +A document has several properties (other than its children): + +- Document type. +- Node type. +- Name. +- Document. +- XPath. + +[Document Type] + + A document may have a document type: + + my_xml = '' + my_doc = Document.new(my_xml) + doc_type = my_doc.doctype + doc_type.class # => REXML::DocType + doc_type.to_s # => "" + +[Node Type] + + A document also has a node type (always +:document+): + + doc.node_type # => :document + +[Name] + + A document has a name (always an empty string): + + doc.name # => "" + +[Document] + + \Method REXML::Document#document returns +self+: + + doc.document == doc # => true + + An object of a different class (\REXML::Element or \REXML::Child) + may have a document, which is the document to which the object belongs; + if so, that document will be an \REXML::Document object. + + doc.root.document.class # => REXML::Document + +[XPath] + + \Method REXML::Element#xpath returns the string xpath to the element, + relative to its most distant ancestor: + + doc.root.class # => REXML::Element + doc.root.xpath # => "/bookstore" + doc.root.texts.first # => "\n\n" + doc.root.texts.first.xpath # => "/bookstore/text()" + + If there is no ancestor, returns the expanded name of the element: + + Element.new('foo').xpath # => "foo" + +=== Document Children + +A document may have children of these types: + +- XML declaration. +- Root element. +- Text. +- Processing instructions. +- Comments. +- CDATA. 
+ +[XML Declaration] + + A document may have an XML declaration, which is stored as an REXML::XMLDecl object: + + doc.xml_decl # => + doc.xml_decl.class # => REXML::XMLDecl + + Document.new('').xml_decl # => + + my_xml = '"' + my_doc = Document.new(my_xml) + xml_decl = my_doc.xml_decl + xml_decl.to_s # => "" + + The version, encoding, and stand-alone values may be retrieved separately: + + my_doc.version # => "1.0" + my_doc.encoding # => "UTF-8" + my_doc.stand_alone? # => "yes" + +[Root Element] + + A document may have a single element child, called the _root_ _element_, + which is stored as an REXML::Element object; + it may be retrieved with method +root+: + + doc.root # => ... + doc.root.class # => REXML::Element + + Document.new('').root # => nil + +[Text] + + A document may have text passages, each of which is stored + as an REXML::Text object: + + doc.texts.each {|t| p [t.class, t] } + + Output: + + [REXML::Text, "\n"] + +[Processing Instructions] + + A document may have processing instructions, which are stored + as REXML::Instruction objects: + + + + Output: + + [REXML::Instruction, ] + [REXML::Instruction, ] + +[Comments] + + A document may have comments, which are stored + as REXML::Comment objects: + + my_xml = <<-EOT + + + EOT + my_doc = Document.new(my_xml) + my_doc.comments.each {|c| p [c.class, c] } + + Output: + + [REXML::Comment, # ... , @string="foo">] + [REXML::Comment, # ... , @string="bar">] + +[CDATA] + + A document may have CDATA entries, which are stored + as REXML::CData objects: + + my_xml = <<-EOT + + + EOT + my_doc = Document.new(my_xml) + my_doc.cdatas.each {|cd| p [cd.class, cd] } + + Output: + + [REXML::CData, "foo"] + [REXML::CData, "bar"] + +The payload of a document is a tree of nodes, descending from the root element: + + doc.root.children.each do |child| + p [child, child.class] + end + +Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... 
] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +== Exploring an Element + +An REXML::Element object represents an XML element. + +The object inherits from its ancestor classes: + +- REXML::Child (includes module REXML::Node) + - REXML::Parent (includes module {Enumerable}[rdoc-ref:Enumerable]). + - REXML::Element (includes module REXML::Namespace). + +This section covers methods: + +- Defined in REXML::Element itself. +- Inherited from REXML::Parent and REXML::Child. +- Included from REXML::Node. + +=== Inside the Element + +[Brief String Representation] + + Use method REXML::Element#inspect to retrieve a brief string representation. + + doc.root.inspect # => " ... " + + The ellipsis (...) indicates that the element has children. + When there are no children, the ellipsis is omitted: + + Element.new('foo').inspect # => "" + + If the element has attributes, those are also included: + + doc.root.elements.first.inspect # => " ... " + +[Extended String Representation] + + Use inherited method REXML::Child.bytes to retrieve an extended + string representation. + + doc.root.bytes # => "\n\n\n Codestin Search App\n Giada De Laurentiis\n 2005\n 30.00\n\n\n\n Codestin Search App\n J K. Rowling\n 2005\n 29.99\n\n\n\n Codestin Search App\n James McGovern\n Per Bothner\n Kurt Cagle\n James Linn\n Vaidyanathan Nagarajan\n 2003\n 49.99\n\n\n\n Codestin Search App\n Erik T. Ray\n 2003\n 39.95\n\n\n" + +[Node Type] + + Use method REXML::Element#node_type to retrieve the node type (always +:element+): + + doc.root.node_type # => :element + +[Raw Mode] + + Use method REXML::Element#raw to retrieve whether (+true+ or +nil+) + raw mode is set. + + doc.root.raw # => nil + +[Context] + + Use method REXML::Element#context to retrieve the context hash + (see {Element Context}[../context_rdoc.html]): + + doc.root.context # => {} + +=== Relationships + +An element may have: + +- Ancestors. +- Siblings. 
+- Children. + +==== Ancestors + +[Containing Document] + + Use method REXML::Element#document to retrieve the containing document, if any: + + ele = doc.root.elements.first # => ... + ele.document # => ... + ele = Element.new('foo') # => + ele.document # => nil + +[Root Element] + + Use method REXML::Element#root to retrieve the root element: + + ele = doc.root.elements.first # => ... + ele.root # => ... + ele = Element.new('foo') # => + ele.root # => + +[Root Node] + + Use method REXML::Element#root_node to retrieve the most distant ancestor, + which is the containing document, if any, otherwise the root element: + + ele = doc.root.elements.first # => ... + ele.root_node # => ... + ele = Element.new('foo') # => + ele.root_node # => + +[Parent] + + Use inherited method REXML::Child#parent to retrieve the parent: + + ele = doc.root # => ... + ele.parent # => ... + ele = doc.root.elements.first # => ... + ele.parent # => ... + + Use included method REXML::Node#index_in_parent to retrieve the index + of the element among all of its parent's children (not just the element children). + Note that, like the index for doc.root.elements[n], + the returned index is 1-based. + + doc.root.children # => + # ["\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n"] + ele = doc.root.elements[1] # => ... + ele.index_in_parent # => 2 + ele = doc.root.elements[2] # => ... + ele.index_in_parent # => 4 + +==== Siblings + +[Next Element] + + Use method REXML::Element#next_element to retrieve the first following + sibling that is itself an element (+nil+ if there is none): + + ele = doc.root.elements[1] + while ele do + p [ele.class, ele] + ele = ele.next_element + end + p ele + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... 
] + +[Previous Element] + + Use method REXML::Element#previous_element to retrieve the first preceding + sibling that is itself an element (+nil+ if there is none): + + ele = doc.root.elements[4] + while ele do + p [ele.class, ele] + ele = ele.previous_element + end + p ele + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + +[Next Node] + + Use included method REXML::Node.next_sibling_node + (or its alias next_sibling) to retrieve the first following node + regardless of its class: + + node = doc.root.children[0] + while node do + p [node.class, node] + node = node.next_sibling + end + p node + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +[Previous Node] + + Use included method REXML::Node.previous_sibling_node + (or its alias previous_sibling) to retrieve the first preceding node + regardless of its class: + + node = doc.root.children[-1] + while node do + p [node.class, node] + node = node.previous_sibling + end + p node + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +==== Children + +[Child Count] + + Use inherited method REXML::Parent.size to retrieve the count + of nodes (of all types) in the element: + + doc.root.size # => 9 + +[Child Nodes] + + Use inherited method REXML::Parent.children to retrieve an array + of the child nodes (of all types): + + doc.root.children # => + # ["\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... 
, + # "\n\n"] + +[Child at Index] + + Use method REXML::Element#[] to retrieve the child at a given numerical index, + or +nil+ if there is no such child: + + doc.root[0] # => "\n\n" + doc.root[1] # => ... + doc.root[7] # => ... + doc.root[8] # => "\n\n" + + doc.root[-1] # => "\n\n" + doc.root[-2] # => ... + + doc.root[50] # => nil + +[Index of Child] + + Use method REXML::Parent#index to retrieve the zero-based child index + of the given object, or #size - 1 if there is no such child: + + ele = doc.root # => ... + ele.index(ele[0]) # => 0 + ele.index(ele[1]) # => 1 + ele.index(ele[7]) # => 7 + ele.index(ele[8]) # => 8 + + ele.index(ele[-1]) # => 8 + ele.index(ele[-2]) # => 7 + + ele.index(ele[50]) # => 8 + +[Element Children] + + Use method REXML::Element#has_elements? to retrieve whether the element + has element children: + + doc.root.has_elements? # => true + REXML::Element.new('foo').has_elements? # => false + + Use method REXML::Element#elements to retrieve the REXML::Elements object + containing the element children: + + eles = doc.root.elements + eles # => # ... > + eles.size # => 4 + eles.each {|e| p [e.class], e } + + Output: + + [ ... , + ... , + ... , + ... + ] + +Note that while in this example, all the element children of the root element are +elements of the same name, 'book', that is not true of all documents; +a root element (or any other element) may have any mixture of child elements. + +[CDATA Children] + + Use method REXML::Element#cdatas to retrieve a frozen array of CDATA children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + cdatas = my_doc.root.cdatas + cdatas.frozen? # => true + cdatas.map {|cd| cd.class } # => [REXML::CData, REXML::CData] + +[Comment Children] + + Use method REXML::Element#comments to retrieve a frozen array of comment children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + comments = my_doc.root.comments + comments.frozen? 
# => true + comments.map {|c| c.class } # => [REXML::Comment, REXML::Comment] + comments.map {|c| c.to_s } # => ["foo", "bar"] + +[Processing Instruction Children] + + Use method REXML::Element#instructions to retrieve a frozen array + of processing instruction children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + instrs = my_doc.root.instructions + instrs.frozen? # => true + instrs.map {|i| i.class } # => [REXML::Instruction, REXML::Instruction] + instrs.map {|i| i.to_s } # => ["", ""] + +[Text Children] + + Use method REXML::Element#has_text? to retrieve whether the element + has text children: + + doc.root.has_text? # => true + REXML::Element.new('foo').has_text? # => false + + Use method REXML::Element#texts to retrieve a frozen array of text children: + + my_xml = 'textmore' + my_doc = REXML::Document.new(my_xml) + texts = my_doc.root.texts + texts.frozen? # => true + texts.map {|t| t.class } # => [REXML::Text, REXML::Text] + texts.map {|t| t.to_s } # => ["text", "more"] + +[Parenthood] + + Use inherited method REXML::Parent.parent? to retrieve whether the element is a parent; + always returns +true+; only REXML::Child#parent? returns +false+. + + doc.root.parent? # => true + +=== Element Attributes + +Use method REXML::Element#has_attributes? to return whether the element +has attributes: + + ele = doc.root # => ... + ele.has_attributes? # => false + ele = ele.elements.first # => ... + ele.has_attributes? # => true + +Use method REXML::Element#attributes to return the hash +containing the attributes for the element. +Each hash key is a string attribute name; +each hash value is an REXML::Attribute object. + + ele = doc.root # => ... + attrs = ele.attributes # => {} + + ele = ele.elements.first # => ... 
+ attrs = ele.attributes # => {"category"=>category='cooking'} + attrs.size # => 1 + attr_name = attrs.keys.first # => "category" + attr_name.class # => String + attr_value = attrs.values.first # => category='cooking' + attr_value.class # => REXML::Attribute + +Use method REXML::Element#[] to retrieve the string value for a given attribute, +which may be given as either a string or a symbol: + + ele = doc.root.elements.first # => ... + attr_value = ele['category'] # => "cooking" + attr_value.class # => String + ele['nosuch'] # => nil + +Use method REXML::Element#attribute to retrieve the value of a named attribute: + + my_xml = "<root xmlns:a='a' a:x='a:x' x='x'/>" + my_doc = REXML::Document.new(my_xml) + my_doc.root.attribute("x") # => x='x' + my_doc.root.attribute("x", "a") # => a:x='a:x' + +== Whitespace + +Use method REXML::Element#ignore_whitespace_nodes to determine whether +whitespace nodes were ignored when the XML was parsed; +returns +true+ if so, +nil+ otherwise. + +Use method REXML::Element#whitespace to determine whether whitespace +is respected for the element; returns +true+ if so, +false+ otherwise. 
+ +== Namespaces + +Use method REXML::Element#namespace to retrieve the string namespace URI +for the element, which may derive from one of its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string) + b = d.elements['//b'] + b.namespace # => "1" + b.namespace('y') # => "2" + b.namespace('nosuch') # => nil + +Use method REXML::Element#namespaces to retrieve a hash of all defined namespaces +in the element and its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string) + d.elements['//a'].namespaces # => {"x"=>"1", "y"=>"2"} + d.elements['//b'].namespaces # => {"x"=>"1", "y"=>"2"} + d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} + +Use method REXML::Element#prefixes to retrieve an array of the string prefixes (names) +of all defined namespaces in the element and its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string, {compress_whitespace: :all}) + d.elements['//a'].prefixes # => ["x", "y"] + d.elements['//b'].prefixes # => ["x", "y"] + d.elements['//c'].prefixes # => ["x", "y", "z"] + +== Traversing + +You can use certain methods to traverse children of the element. +Each child that meets given criteria is yielded to the given block. + +[Traverse All Children] + + Use inherited method REXML::Parent#each (or its alias #each_child) to traverse + all children of the element: + + doc.root.each {|child| p [child.class, child] } + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +[Traverse Element Children] + + Use method REXML::Element#each_element to traverse only the element children + of the element: + + doc.root.each_element {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... 
] + +[Traverse Element Children with Attribute] + + Use method REXML::Element#each_element_with_attribute with the single argument + +attr_name+ to traverse each element child that has the given attribute: + + my_doc = Document.new '' + my_doc.root.each_element_with_attribute('id') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + [REXML::Element, ] + [REXML::Element, ] + + Use the same method with a second argument +value+ to traverse + each element child element that has the given attribute and value: + + my_doc.root.each_element_with_attribute('id', '1') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + [REXML::Element, ] + + Use the same method with a third argument +max+ to traverse + no more than the given number of element children: + + my_doc.root.each_element_with_attribute('id', '1', 1) {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + + Use the same method with a fourth argument +xpath+ to traverse + only those element children that match the given xpath: + + my_doc.root.each_element_with_attribute('id', '1', 2, '//d') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + +[Traverse Element Children with Text] + + Use method REXML::Element#each_element_with_text with no arguments + to traverse those element children that have text: + + my_doc = Document.new 'bbd' + my_doc.root.each_element_with_text {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + + Use the same method with the single argument +text+ to traverse + those element children that have exactly that text: + + my_doc.root.each_element_with_text('b') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + + Use the same method with additional second argument +max+ to traverse + no more than the given number of element children: + + my_doc.root.each_element_with_text('b', 1) {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... 
] + + Use the same method with additional third argument +xpath+ to traverse + only those element children that also match the given xpath: + + my_doc.root.each_element_with_text('b', 2, '//c') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + +[Traverse Element Children's Indexes] + + Use inherited method REXML::Parent#each_index to traverse all children's indexes + (not just those of element children): + + doc.root.each_index {|i| print i } + + Output: + + 012345678 + +[Traverse Children Recursively] + + Use included method REXML::Node#each_recursive to traverse all children recursively: + + doc.root.each_recursive {|child| p [child.class, child] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, Codestin Search App + Giada De Laurentiis + 2005 + 30.00 + + + + Codestin Search App + J K. Rowling + 2005 + 29.99 + + + + Codestin Search App + James McGovern + Per Bothner + Kurt Cagle + James Linn + Vaidyanathan Nagarajan + 2003 + 49.99 + + + + Codestin Search App + Erik T. Ray + 2003 + 39.95 + + + diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 8933a013..11893a95 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative "namespace" require_relative 'text' @@ -13,9 +13,6 @@ class Attribute # The element to which this attribute belongs attr_reader :element - # The normalized value of this attribute. That is, the attribute with - # entities intact. 
- attr_writer :normalized PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um @@ -122,10 +119,13 @@ def hash # b = Attribute.new( "ns:x", "y" ) # b.to_string # -> "ns:x='y'" def to_string + value = to_s if @element and @element.context and @element.context[:attribute_quote] == :quote - %Q^#@expanded_name="#{to_s().gsub(/"/, '"')}"^ + value = value.gsub('"', '"') if value.include?('"') + %Q^#@expanded_name="#{value}"^ else - "#@expanded_name='#{to_s().gsub(/'/, ''')}'" + value = value.gsub("'", ''') if value.include?("'") + "#@expanded_name='#{value}'" end end @@ -141,7 +141,6 @@ def to_s return @normalized if @normalized @normalized = Text::normalize( @unnormalized, doctype ) - @unnormalized = nil @normalized end @@ -150,10 +149,16 @@ def to_s def value return @unnormalized if @unnormalized @unnormalized = Text::unnormalize( @normalized, doctype ) - @normalized = nil @unnormalized end + # The normalized value of this attribute. That is, the attribute with + # entities intact. 
+ def normalized=(new_normalized) + @normalized = new_normalized + @unnormalized = nil + end + # Returns a copy of this attribute def clone Attribute.new self @@ -190,7 +195,7 @@ def node_type end def inspect - rv = "" + rv = +"" write( rv ) rv end diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 2edeb987..b1caa020 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -69,7 +69,7 @@ class Document < Element # d.to_s # => "FooBar" # # When argument +document+ is given, it must be an existing - # document object, whose context and attributes (but not chidren) + # document object, whose context and attributes (but not children) # are cloned into the new document: # # d = REXML::Document.new(xml_string) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4c21dbd5..a5808d7c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: @@ -989,7 +981,7 @@ def previous_element # :call-seq: # has_text? -> true or false # - # Returns +true if the element has one or more text noded, + # Returns +true+ if the element has one or more text nodes, # +false+ otherwise: # # d = REXML::Document.new 'text' @@ -1006,7 +998,7 @@ def has_text? # text(xpath = nil) -> text_string or nil # # Returns the text string from the first text node child - # in a specified element, if it exists, # +nil+ otherwise. + # in a specified element, if it exists, +nil+ otherwise. 
# # With no argument, returns the text from the first text node in +self+: # @@ -1014,7 +1006,7 @@ def has_text? # d.root.text.class # => String # d.root.text # => "some text " # - # With argument +xpath+, returns text from the the first text node + # With argument +xpath+, returns text from the first text node # in the element that matches +xpath+: # # d.root.text(1) # => "this is bold!" @@ -1284,16 +1276,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 89a9e84c..573db691 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -132,24 +132,34 @@ def to_s # then: # doctype.entity('yada').value #-> "nanoo bar nanoo" def value - if @value - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) + @resolved_value ||= resolve_value + end + + def parent=(other) + @resolved_value = nil + super + end + + private + def resolve_value + return nil if @value.nil? 
+ return @value unless @value.match?(PEREFERENCE_RE) + + matches = @value.scan(PEREFERENCE_RE) + rv = @value.clone + if @parent + sum = 0 + matches.each do |entity_reference| + entity_value = @parent.entity( entity_reference[0] ) + if sum + entity_value.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + else + sum += entity_value.bytesize end + rv.gsub!( /%#{entity_reference.join};/um, entity_value ) end - return rv end - nil + rv end end diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index 562ef946..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'default' module REXML @@ -58,7 +58,7 @@ def write_element(node, output) skip = false if compact if node.children.inject(true) {|s,c| s & c.kind_of?(Text)} - string = "" + string = +"" old_level = @level @level = 0 node.children.each { |child| write( child, string ) } @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 77926bf2..4c114616 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -262,11 +262,10 @@ def Functions::string_length( string ) string(string).length end - # UNTESTED def Functions::normalize_space( string=nil ) string = string(@@context[:node]) if string.nil? if string.kind_of? 
Array - string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string} + string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x} else string.to_s.strip.gsub(/\s+/um, ' ') end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 924edf95..2e67252a 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'xmltokens' @@ -10,13 +10,17 @@ module Namespace # The expanded name of the object, valid if name is set attr_accessor :prefix include XMLTokens + NAME_WITHOUT_NAMESPACE = /\A#{NCNAME_STR}\z/ NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u # Sets the name and the expanded name def name=( name ) @expanded_name = name - case name - when NAMESPLIT + if name.match?(NAME_WITHOUT_NAMESPACE) + @prefix = "" + @namespace = "" + @name = name + elsif name =~ NAMESPLIT if $1 @prefix = $1 else @@ -24,7 +28,7 @@ def name=( name ) @namespace = "" end @name = $2 - when "" + elsif name == "" @prefix = nil @namespace = nil @name = nil diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 081caba6..c771db70 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -52,10 +52,14 @@ def parent? # Visit all subnodes of +self+ recursively def each_recursive(&block) # :yields: node - self.elements.each {|node| - block.call(node) - node.each_recursive(&block) - } + stack = [] + each { |child| stack.unshift child if child.node_type == :element } + until stack.empty? 
+ child = stack.pop + yield child + n = stack.size + child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element } + end end # Find (and return) first subnode (recursively) for which the block diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index 7b16cd1a..e57d05fd 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -29,6 +29,7 @@ def to_s err << "\nLine: #{line}\n" err << "Position: #{position}\n" err << "Last 80 unconsumed characters:\n" + err.force_encoding("ASCII-8BIT") err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..44dc6580 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' require_relative '../source' @@ -7,6 +7,17 @@ module REXML module Parsers + if StringScanner::Version < "3.0.8" + module StringScannerCaptures + refine StringScanner do + def captures + values_at(*(1...size)) + end + end + end + using StringScannerCaptures + end + # = Using the Pull Parser # This API is experimental, and subject to change. 
# parser = PullParser.new( "texttxet" ) @@ -96,7 +107,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s* [/'/, "'", "'", /'/] } + module Private + TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /#{NAME}/um + GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end + end + private_constant :Private + def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new + @entity_expansion_count = 0 end def add_listener( listener ) @@ -122,10 +153,12 @@ def add_listener( listener ) end attr_reader :source + attr_reader :entity_expansion_count def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] @@ -180,6 +213,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event @@ -192,236 +227,269 @@ def pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? 
+ if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + + @source.ensure_buffer if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START + start_position = @source.position + if @source.match("/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + elsif @source.match("/um, true) + if md.nil? 
+ raise REXML::ParseException.new("Unclosed comment", @source) end - if @source.match(/\A\s*\[/um, true) + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] + elsif @source.match("DOCTYPE", true) + base_error_message = "Malformed DOCTYPE" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) + end + @nsstack.unshift(Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype + @source.ensure_buffer else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + @source.ensure_buffer + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end - end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ - else - @document_status = :after_doctype - if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + 
raise REXML::ParseException.new(message, @source) end end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - match.delete_at(5) if match.size > 5 # Chop out NDATA decl - # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] - - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 - end + @source.match(/\s*/um, true) # skip spaces + start_position = @source.position + if @source.match("/um, true) + raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? 
+ return [ :elementdecl, "/um) - message = "#{base_error_message}: name is missing" + match = [:entitydecl, *match_data.captures.compact] + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else - message = "#{base_error_message}: invalid declaration name" + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] end - raise REXML::ParseException.new(message, @source) - end - name = parse_name(base_error_message) - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: true) - unless @source.match(/\A\s*>/um, true) - message = "#{base_error_message}: garbage before end >" - raise REXML::ParseException.new(message, @source) + match << '%' if ref + return match + elsif @source.match("ATTLIST", true) + md = @source.match(Private::ATTLISTDECL_END, true) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].strip.scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! 
+ val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + elsif @source.match("NOTATION", true) + base_error_message = "Malformed notation declaration" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) + end + name = parse_name(base_error_message) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: true) + unless @source.match(/\s*>/um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + elsif md = @source.match(/--(.*?)-->/um, true) + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md end - return [:notationdecl, name, *id] - when DOCTYPE_END + elsif match = @source.match(/(%.*?;)\s*/um, true) + return [ :externalentity, match[1] ] + elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin - @source.read if @source.buffer.size<2 - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ + start_position = @source.position + if @source.match("<", true) + # :text's read_until may remain only "<" in buffer. In the + # case, buffer is empty here. So we need to fill buffer + # here explicitly. 
+ @source.ensure_buffer + if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - md = @source.match( CLOSE_MATCH, true ) + md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) end if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" - message << " (got '#{md[1]}')" if md + message += " (got '#{md[1]}')" if md + @source.position = start_position if md.nil? raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! - md = @source.match(/\A(\s*[^>]*>)/um) + elsif @source.match("!", true) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ + if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? 
+ elsif @source.match("?", true) return process_instruction else # Get the next tag - md = @source.match(TAG_MATCH, true) + md = @source.match(Private::TAG_PATTERN, true) unless md + @source.position = start_position raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] + @prefixes.clear + @prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + attributes, closed = parse_attributes(@prefixes, curr_ns) # Verify that all of the prefixes have been defined - for prefix in prefixes + for prefix in @prefixes unless @nsstack.find{|k| k.member?(prefix)} raise UndefinedNamespaceException.new(prefix,@source,self) end end if closed - @closed = md[1] + @closed = tag @nsstack.shift else - @tags.push( md[1] ) + if @tags.empty? and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end + @tags.push( tag ) end - return [ :start_element, md[1], attributes ] + @have_root = true + return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) - if md[0].length == 0 - @source.match( /(\s+)/, true ) + text = @source.read_until("<") + if text.chomp!("<") + @source.position -= "<".bytesize + end + if @tags.empty? 
+ unless /\A\s*\z/.match?(text) + if @have_root + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + else + raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source) + end + end + return pull_event if @have_root end - #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 - #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] + return [ :text, text ] end rescue REXML::UndefinedNamespaceException raise @@ -438,7 +506,9 @@ def pull_event def entity( reference, entities ) value = nil value = entities[ reference ] if entities - if not value + if value + record_entity_expansion + else value = DEFAULT_ENTITIES[ reference ] value = value[2] if value end @@ -463,35 +533,51 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') } matches.collect!{|x|x[0]}.compact! 
if matches.size > 0 + sum = 0 matches.each do |entity_reference| unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) + sum += rv.bytesize + if sum > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + end else er = DEFAULT_ENTITIES[entity_reference] rv.gsub!( er[0], er[2] ) if er end end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end private + + def record_entity_expansion + @entity_expansion_count += 1 + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." + end + end + def need_source_encoding_update?(xml_declaration_encoding) return false if xml_declaration_encoding.nil? return false if /\AUTF-16\z/i =~ xml_declaration_encoding @@ -499,16 +585,16 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(/\A\s*#{NAME}/um, true) + md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\A\s*\S/um) + if @source.match(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" end raise REXML::ParseException.new(message, @source) end - md[1] + md[0] end def parse_id(base_error_message, @@ -578,96 +664,99 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction - match_data = @source.match(INSTRUCTION_PATTERN, true) - unless match_data - message = "Invalid processing instruction node" - raise REXML::ParseException.new(message, @source) + name = parse_name("Malformed XML: Invalid processing instruction node") + if @source.match(/\s+/um, true) + match_data = @source.match(/(.*?)\?>/um, true) + unless match_data + raise 
ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + content = match_data[1] + else + content = nil + unless @source.match("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + end + if name == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + version = VERSION.match(content) + version = version[1] unless version.nil? + encoding = ENCODING.match(content) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(content) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, match_data[1], match_data[2]] + [:processing_instruction, name, content] end def parse_attributes(prefixes, curr_ns) attributes = {} closed = false - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data.nil? - message = "Start tag isn't ended" - raise REXML::ParseException.new(message, @source) - end - - raw_attributes = match_data[1] - closed = !match_data[2].nil? - return attributes, closed if raw_attributes.nil? - return attributes, closed if raw_attributes.empty? - - scanner = StringScanner.new(raw_attributes) - until scanner.eos? - if scanner.scan(/\s+/) - break if scanner.eos? 
- end - - pos = scanner.pos - loop do - break if scanner.scan(ATTRIBUTE_PATTERN) - unless scanner.scan(QNAME) - message = "Invalid attribute name: <#{scanner.rest}>" - raise REXML::ParseException.new(message, @source) - end - name = scanner[0] - unless scanner.scan(/\s*=\s*/um) + while true + if @source.match(">", true) + return attributes, closed + elsif @source.match("/>", true) + closed = true + return attributes, closed + elsif match = @source.match(QNAME, true) + name = match[1] + prefix = match[2] + local_part = match[3] + + unless @source.match(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = scanner.scan(/['"]/) - unless quote + unless match = @source.match(/(['"])/, true) message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless scanner.scan(/.*#{Regexp.escape(quote)}/um) - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data - scanner << "/" if closed - scanner << ">" - scanner << match_data[1] - scanner.pos = pos - closed = !match_data[2].nil? 
- next - end - message = - "Missing attribute value end quote: <#{name}>: <#{quote}>" + quote = match[1] + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end - end - name = scanner[1] - prefix = scanner[2] - local_part = scanner[3] - # quote = scanner[4] - value = scanner[5] - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ + @source.match(/\s*/um, true) + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) + raise REXML::ParseException.new( msg, @source, self) end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - if attributes.has_key?(name) - msg = "Duplicate attribute #{name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end + if attributes[name] + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end - attributes[name] = value + attributes[name] = value + else + message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" + raise 
REXML::ParseException.new(message, @source) + end end - return attributes, closed end end end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index f8b232a2..36b45953 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -47,6 +47,10 @@ def add_listener( listener ) @listeners << listener end + def entity_expansion_count + @parser.entity_expansion_count + end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..cec9d2fc 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -22,6 +22,10 @@ def source @parser.source end + def entity_expansion_count + @parser.entity_expansion_count + end + def add_listener( listener ) @parser.add_listener( listener ) end @@ -157,25 +161,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) 
when :start_doctype diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..0cb6f7cc 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -16,7 +16,6 @@ def add_listener( listener ) def parse tag_stack = [] - in_doctype = false entities = nil begin while true @@ -39,17 +38,15 @@ def parse tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +57,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index d92678fe..bd3b6856 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -1,4 +1,5 @@ # frozen_string_literal: false + require_relative '../namespace' require_relative '../xmltokens' @@ -38,108 +39,143 @@ def predicate path parsed end - def abbreviate( path ) - path = path.kind_of?(String) ? 
parse( path ) : path - string = "" - document = false - while path.size > 0 - op = path.shift + def abbreviate(path_or_parsed) + if path_or_parsed.kind_of?(String) + parsed = parse(path_or_parsed) + else + parsed = path_or_parsed + end + components = [] + component = nil + while parsed.size > 0 + op = parsed.shift case op when :node + component << "node()" when :attribute - string << "/" if string.size > 0 - string << "@" + component = "@" + components << component when :child - string << "/" if string.size > 0 + component = "" + components << component when :descendant_or_self - string << "/" + next_op = parsed[0] + if next_op == :node + parsed.shift + component = "" + components << component + else + component = "descendant-or-self::" + components << component + end when :self - string << "." + next_op = parsed[0] + if next_op == :node + parsed.shift + components << "." + else + component = "self::" + components << component + end when :parent - string << ".." + next_op = parsed[0] + if next_op == :node + parsed.shift + components << ".." 
+ else + component = "parent::" + components << component + end when :any - string << "*" + component << "*" when :text - string << "text()" + component << "text()" when :following, :following_sibling, :ancestor, :ancestor_or_self, :descendant, :namespace, :preceding, :preceding_sibling - string << "/" unless string.size == 0 - string << op.to_s.tr("_", "-") - string << "::" + component = op.to_s.tr("_", "-") << "::" + components << component when :qname - prefix = path.shift - name = path.shift - string << prefix+":" if prefix.size > 0 - string << name + prefix = parsed.shift + name = parsed.shift + component << prefix+":" if prefix.size > 0 + component << name when :predicate - string << '[' - string << predicate_to_string( path.shift ) {|x| abbreviate( x ) } - string << ']' + component << '[' + component << predicate_to_path(parsed.shift) {|x| abbreviate(x)} + component << ']' when :document - document = true + components << "" when :function - string << path.shift - string << "( " - string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} - string << " )" + component << parsed.shift + component << "( " + component << predicate_to_path(parsed.shift[0]) {|x| abbreviate(x)} + component << " )" when :literal - string << %Q{ "#{path.shift}" } + component << quote_literal(parsed.shift) else - string << "/" unless string.size == 0 - string << "UNKNOWN(" - string << op.inspect - string << ")" + component << "UNKNOWN(" + component << op.inspect + component << ")" end end - string = "/"+string if document - return string + case components + when [""] + "/" + when ["", ""] + "//" + else + components.join("/") + end end - def expand( path ) - path = path.kind_of?(String) ? 
parse( path ) : path - string = "" + def expand(path_or_parsed) + if path_or_parsed.kind_of?(String) + parsed = parse(path_or_parsed) + else + parsed = path_or_parsed + end + path = "" document = false - while path.size > 0 - op = path.shift + while parsed.size > 0 + op = parsed.shift case op when :node - string << "node()" + path << "node()" when :attribute, :child, :following, :following_sibling, :ancestor, :ancestor_or_self, :descendant, :descendant_or_self, :namespace, :preceding, :preceding_sibling, :self, :parent - string << "/" unless string.size == 0 - string << op.to_s.tr("_", "-") - string << "::" + path << "/" unless path.size == 0 + path << op.to_s.tr("_", "-") + path << "::" when :any - string << "*" + path << "*" when :qname - prefix = path.shift - name = path.shift - string << prefix+":" if prefix.size > 0 - string << name + prefix = parsed.shift + name = parsed.shift + path << prefix+":" if prefix.size > 0 + path << name when :predicate - string << '[' - string << predicate_to_string( path.shift ) { |x| expand(x) } - string << ']' + path << '[' + path << predicate_to_path( parsed.shift ) { |x| expand(x) } + path << ']' when :document document = true else - string << "/" unless string.size == 0 - string << "UNKNOWN(" - string << op.inspect - string << ")" + path << "UNKNOWN(" + path << op.inspect + path << ")" end end - string = "/"+string if document - return string + path = "/"+path if document + path end - def predicate_to_string( path, &block ) - string = "" - case path[0] + def predicate_to_path(parsed, &block) + path = "" + case parsed[0] when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq, :gteq, :div, :mod, :union - op = path.shift + op = parsed.shift case op when :eq op = "=" @@ -156,36 +192,50 @@ def predicate_to_string( path, &block ) when :union op = "|" end - left = predicate_to_string( path.shift, &block ) - right = predicate_to_string( path.shift, &block ) - string << " " - string << left - string << " " - string << 
op.to_s - string << " " - string << right - string << " " + left = predicate_to_path( parsed.shift, &block ) + right = predicate_to_path( parsed.shift, &block ) + path << left + path << " " + path << op.to_s + path << " " + path << right when :function - path.shift - name = path.shift - string << name - string << "( " - string << predicate_to_string( path.shift, &block ) - string << " )" + parsed.shift + name = parsed.shift + path << name + path << "(" + parsed.shift.each_with_index do |argument, i| + path << ", " if i > 0 + path << predicate_to_path(argument, &block) + end + path << ")" when :literal - path.shift - string << " " - string << path.shift.inspect - string << " " + parsed.shift + path << quote_literal(parsed.shift) else - string << " " - string << yield( path ) - string << " " + path << yield( parsed ) end - return string.squeeze(" ") + return path.squeeze(" ") end + # For backward compatibility + alias_method :preciate_to_string, :predicate_to_path private + def quote_literal( literal ) + case literal + when String + # XPath 1.0 does not support escape characters. + # Assumes literal does not contain both single and double quotes. + if literal.include?("'") + "\"#{literal}\"" + else + "'#{literal}'" + end + else + literal.inspect + end + end + #LocationPath # | RelativeLocationPath # | '/' RelativeLocationPath? diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 8a01f0e1..39e92a57 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -26,10 +26,12 @@ # - REXML::Document. # - REXML::Element. # +# There's also an {REXML tutorial}[doc/rexml/tutorial_rdoc.html]. 
+# module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.5" + VERSION = "3.3.3" REVISION = "" Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 90b370b9..ff887fc0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,8 +1,28 @@ # coding: US-ASCII # frozen_string_literal: false + +require "strscan" + require_relative 'encoding' module REXML + if StringScanner::Version < "1.0.0" + module StringScannerCheckScanString + refine StringScanner do + def check(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def scan(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + end + end + using StringScannerCheckScanString + end + # Generates Source-s. USE THIS CLASS. class SourceFactory # Generates a Source object @@ -30,18 +50,27 @@ def SourceFactory::create_from(arg) # objects and provides consumption of text class Source include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer # The line number of the last consumed text attr_reader :line attr_reader :encoding + module Private + SCANNER_RESET_SIZE = 100000 + PRE_DEFINED_TERM_PATTERNS = {} + pre_defined_terms = ["'", '"', "<"] + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + end + private_constant :Private + # Constructor # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -50,6 +79,20 @@ def initialize(arg, encoding=nil) @line = 0 end + # The current buffer (what we're going to read next) + def buffer + @scanner.rest + end + + def drop_parsed_content + if @scanner.pos > 
Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + + def buffer_encoding=(encoding) + @scanner.string.force_encoding(encoding) + end # Inherited from Encoding # Overridden to support optimized en/decoding @@ -58,98 +101,78 @@ def encoding=(enc) encoding_updated end - # Scans the source for a given pattern. Note, that this is not your - # usual scan() method. For one thing, the pattern argument has some - # requirements; for another, the source can be consumed. You can easily - # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexps on the fly; however, this was - # computationally expensive and slowed down the entire REXML package - # considerably, since this is by far the most commonly called method. - # @param pattern must be a Regexp, and must be in the form of - # /^\s*(#{your pattern, with no groups})(.*)/. The first group - # will be returned; the second group is used if the consume flag is - # set. - # @param consume if true, the pattern returned will be consumed, leaving - # everything after it in the Source. - # @return the pattern, if found, or nil if the Source is empty or the - # pattern is not found. - def scan(pattern, cons=false) - return nil if @buffer.nil? - rv = @buffer.scan(pattern) - @buffer = $' if cons and rv.size>0 - rv + def read(term = nil) end - def read + def read_until(term) + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + data = @scanner.scan_until(pattern) + unless data + data = @scanner.rest + @scanner.pos = @scanner.string.bytesize + end + data end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) + def ensure_buffer end - def match_to( char, pattern ) - return pattern.match(@buffer) + def match(pattern, cons=false) + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? 
nil : @scanner + end end - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md + def position + @scanner.pos end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + def position=(pos) + @scanner.pos = pos end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +195,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +204,72 @@ def initialize(arg, 
block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil + def read(term = nil, min_bytes = 1) + term = encode(term) if term + begin + str = readline(term) + @scanner << str + read_bytes = str.bytesize + begin + while read_bytes < min_bytes + str = readline(term) + @scanner << str + read_bytes += str.bytesize end + rescue IOError end - rv = super + true + rescue Exception, NameError + @source = nil + false end - rv.taint if RUBY_VERSION < '2.7' - rv end - def read - begin - @buffer << readline - rescue Exception, NameError - @source = nil + def read_until(term) + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + term = encode(term) + until str = @scanner.scan_until(pattern) + break if @source.nil? + break if @source.eof? + @scanner << readline(term) + end + if str + read if @scanner.eos? and !@source.eof? + str + else + rest = @scanner.rest + @scanner.pos = @scanner.string.bytesize + rest end end - def consume( pattern ) - match( pattern, true ) + def ensure_buffer + read if @scanner.eos? 
&& @source end def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source - begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - rescue - @source = nil + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 + while true + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) end + break if md + return nil if pattern.is_a?(String) + return nil if @source.nil? + return nil unless read(nil, min_bytes) + min_bytes *= 2 end - rv.taint if RUBY_VERSION < '2.7' - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -263,8 +293,8 @@ def current_line end private - def readline - str = @source.readline(@line_break) + def readline(term = nil) + str = @source.readline(term || @line_break) if @pending_buffer if str.nil? str = @pending_buffer @@ -290,7 +320,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 050b09c9..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'security' require_relative 'entity' require_relative 'doctype' @@ -131,7 +131,7 @@ def parent= parent def Text.check string, pattern, doctype # illegal anywhere - if string !~ VALID_XML_CHARS + if !string.match?(VALID_XML_CHARS) if String.method_defined? 
:encode string.chars.each do |c| case c.ord @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? 
character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type @@ -371,7 +391,7 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input.to_s # Doing it like this rather than in a loop improves the speed #copy = copy.gsub( EREFERENCE, '&' ) - copy = copy.gsub( "&", "&" ) + copy = copy.gsub( "&", "&" ) if copy.include?("&") if doctype # Replace all ampersands that aren't part of an entity doctype.entities.each_value do |entity| @@ -382,7 +402,9 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) else # Replace all ampersands that aren't part of an entity DocType::DEFAULT_ENTITIES.each_value do |entity| - copy = copy.gsub(entity.value, "&#{entity.name};" ) + if copy.include?(entity.value) + copy = copy.gsub(entity.value, "&#{entity.name};" ) + end end end copy diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index d8b88e7a..5eb1e5a9 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -590,6 +590,7 @@ def filter_nodeset(nodeset) def evaluate_predicate(expression, nodesets) enter(:predicate, expression, nodesets) if @debug + new_nodeset_count = 0 new_nodesets = nodesets.collect do |nodeset| new_nodeset = [] subcontext = { :size => nodeset.size } @@ -606,17 +607,20 @@ def evaluate_predicate(expression, nodesets) result = result[0] if result.kind_of? Array and result.length == 1 if result.kind_of? Numeric if result == node.position - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end elsif result.instance_of? 
Array if result.size > 0 and result.inject(false) {|k,s| s or k} if result.size > 0 - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end else if result - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end end diff --git a/rexml.gemspec b/rexml.gemspec index 620a8981..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", @@ -52,10 +56,8 @@ Gem::Specification.new do |spec| spec.files = files spec.rdoc_options.concat(["--main", "README.md"]) spec.extra_rdoc_files = rdoc_files - spec.bindir = "exe" - spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } - spec.add_development_dependency "bundler" - spec.add_development_dependency "rake" - spec.add_development_dependency "test-unit" + spec.required_ruby_version = '>= 2.5.0' + + spec.add_runtime_dependency("strscan") end diff --git a/test/data/much_ado.xml b/test/data/much_ado.xml index f008fadb..0040088c 100644 --- a/test/data/much_ado.xml +++ b/test/data/much_ado.xml @@ -4735,7 +4735,7 @@ CLAUDIO, BENEDICK, HERO, BEATRICE, and Attendants But they shall find, awaked in such a kind, Both strength of limb and policy of mind, Ability in means and choice of friends, -To quit me of them throughly. +To quit me of them thoroughly. 
diff --git a/test/data/ofbiz-issues-full-177.xml b/test/data/ofbiz-issues-full-177.xml index bfff771d..e1f7bdfd 100644 --- a/test/data/ofbiz-issues-full-177.xml +++ b/test/data/ofbiz-issues-full-177.xml @@ -152,8 +152,8 @@ - - + + diff --git a/test/data/test/tests.xml b/test/data/test/tests.xml index cf03b42b..fd415679 100644 --- a/test/data/test/tests.xml +++ b/test/data/test/tests.xml @@ -299,7 +299,7 @@ - + web-app web-app web-app @@ -318,7 +318,7 @@ - + web-app web-app web-app diff --git a/test/data/tutorial.xml b/test/data/tutorial.xml index bf5783d0..9c4639b9 100644 --- a/test/data/tutorial.xml +++ b/test/data/tutorial.xml @@ -286,7 +286,7 @@ el1 << Text.new(" cruel world") strings.

I can't emphasize this enough, because people do have problems with - this. REXML can't possibly alway guess correctly how your text is + this. REXML can't possibly always guess correctly how your text is encoded, so it always assumes the text is UTF-8. It also does not warn you when you try to add text which isn't properly encoded, for the same reason. You must make sure that you are adding UTF-8 text. diff --git a/test/formatter/test_default.rb b/test/formatter/test_default.rb index 321d8180..aa403dbe 100644 --- a/test/formatter/test_default.rb +++ b/test/formatter/test_default.rb @@ -2,7 +2,7 @@ module REXMLTests class DefaultFormatterTest < Test::Unit::TestCase def format(node) formatter = REXML::Formatters::Default.new - output = "" + output = +"" formatter.write(node, output) output end diff --git a/test/functions/test_base.rb b/test/functions/test_base.rb index 74dc1a31..daa38156 100644 --- a/test/functions/test_base.rb +++ b/test/functions/test_base.rb @@ -229,8 +229,30 @@ def test_normalize_space assert_equal( [REXML::Comment.new("COMMENT A")], m ) end + def test_normalize_space_strings + source = <<-XML +breakfast boosts\t\t + +concentration +Coffee beans + aroma + + + + Dessert + \t\t after dinner + XML + normalized_texts = REXML::XPath.each(REXML::Document.new(source), "normalize-space(//text())").to_a + assert_equal([ + "breakfast boosts concentration", + "Coffee beans aroma", + "Dessert after dinner", + ], + normalized_texts) + end + def test_string_nil_without_context - doc = REXML::Document.new(<<-XML) + doc = REXML::Document.new(<<~XML) diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb new file mode 100644 index 00000000..43882528 --- /dev/null +++ b/test/parse/test_attribute_list_declaration.rb @@ -0,0 +1,30 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttributeListDeclaration < Test::Unit::TestCase + include 
Test::Unit::CoreAssertions + + def test_linear_performance_space + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]>") + end + end + + def test_linear_performance_tab_and_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + end +end diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..b5f1a3bc --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..bf8d2190 --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..4475dca7 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,151 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def parse(xml) + REXML::Document.new(xml) + end + + 
class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? 
+ event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_linear_performance_top_level_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + + def test_linear_performance_in_element_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 55713909..99c23745 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -36,6 +40,66 @@ def test_garbage_plus_before_name_at_line_start + r SYSTEM "urn:x-rexml:test" [ ]> DETAIL end + + def test_no_name + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: name is missing +Line: 3 +Position: 17 +Last 80 unconsumed characters: + + DETAIL + end + end + + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + 
exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + " * n + "]>") + rescue + end + end + end + + def test_linear_performance_comment_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " -->]>") + end + end + + def test_linear_performance_external_entity_right_bracket_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + ";]>") + end + end end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 9f172a28..2b0746ea 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -41,9 +45,22 @@ def test_empty_namespace_attribute_name assert_equal(<<-DETAIL.chomp, exception.to_s) Invalid attribute name: <:a=""> Line: 1 -Position: 9 +Position: 13 Last 80 unconsumed characters: +:a=""> + DETAIL + end + def test_empty_namespace_attribute_name_with_utf8_character + exception = assert_raise(REXML::ParseException) do + parse("") # U+200B ZERO WIDTH SPACE + end + assert_equal(<<-DETAIL.chomp.force_encoding("ASCII-8BIT"), exception.to_s) +Invalid attribute name: <:\xE2\x80\x8B> +Line: 1 +Position: 8 +Last 80 unconsumed characters: +:\xE2\x80\x8B> DETAIL end @@ -72,6 +89,47 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '" * n + '">') + end end end end diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..81d95b58 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,557 @@ +# frozen_string_literal: false +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + public + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-GEDecl + class TestGeneralEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDef + class TestEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 59 +Last 80 unconsumed characters: + valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 44 +Last 80 
unconsumed characters: + valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + valid-name "invalid-entity-value'>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 68 +Last 80 unconsumed characters: + valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + valid-name SYSTEM 'invalid-system-literal">]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "invalid-system-literal'>]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + 
assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 74 +Last 80 unconsumed characters: + valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "invalid-pubid-literal' "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name PUBLIC>]> + DETAIL + end 
+ end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NDataDecl + class TestNotationDataDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 109 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "valid-system-literal" NDATA invalid&nam + DETAIL + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 83 +Last 80 unconsumed characters: + valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 102 +Last 80 unconsumed characters: + valid-namePUBLIC"valid-pubid-literal""valid-system-literal"NDATAvalid-name>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDecl + class TestParsedEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDef + class TestParsedEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = 
assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + % valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 46 +Last 80 unconsumed characters: + % valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % valid-name 'invalid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + % valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 72 +Last 80 unconsumed characters: + % valid-name SYSTEM 
"invalid-system-literal'>]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" 'invalid-system-literal">]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 69 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 76 +Last 80 unconsumed characters: + % valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + 
REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC 'invalid-pubid-literal" "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name PUBLIC>]> + DETAIL + end + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 85 +Last 80 unconsumed characters: + % valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + %valid-nameSYSTEM"valid-system-literal">]> + DETAIL + end + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 5 +Position: 70 +Last 80 unconsumed characters: +> ]> + DETAIL + end + + def test_linear_performance_entity_value_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + + def test_linear_performance_entity_value_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_system_gt_right_bracket + seq 
= [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_public_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + end +end diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index 19a0536d..9e81b6a4 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -35,7 +35,7 @@ def test_no_name Line: 5 Position: 72 Last 80 unconsumed characters: - ]> + ]> DETAIL end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..ba381dc4 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests - class TestParseProcessinInstruction < Test::Unit::TestCase + class TestParseProcessingInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -13,31 +17,110 @@ def test_no_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid processing instruction node +Malformed XML: Invalid processing instruction node: invalid name Line: 1 Position: 4 Last 80 unconsumed characters: - +?> + DETAIL + end + + def test_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse("') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + DETAIL end + end - def test_garbage_text - # TODO: This should be parse error. - # Create test/parse/test_document.rb or something and move this to it. 
- doc = parse(<<-XML) -x?> - XML - pi = doc.children[1] - assert_equal([ - "x", - "y\n"]], + [[doc.children[0].target, doc.children[0].content], + [doc.children[1].target, doc.children[1].content]]) + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_content_question + document = REXML::Document.new("") + assert_equal("con?tent", document.root.children.first.content) + end + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " ?>") + end + end + + def test_linear_performance_tab + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(" ?>") end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..04f553ae --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,57 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_before_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('b') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'b') + Line: 1 + Position: 4 + Last 80 unconsumed characters: + + DETAIL + end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? + event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end + end +end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? 
+ parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..b3f576ff 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -17,7 +17,6 @@ def test_entity_declaration [:entitydecl, "name", "value"] ], [:start_element, :parent, "root", {}], - [:text, "\n"], ], parse(<<-INTERNAL_SUBSET)) diff --git a/test/parser/test_xpath.rb b/test/parser/test_xpath.rb new file mode 100644 index 00000000..9143d25c --- /dev/null +++ b/test/parser/test_xpath.rb @@ -0,0 +1,115 @@ +# frozen_string_literal: false + +require "test/unit" +require "rexml/parsers/xpathparser" + +module REXMLTests + class TestXPathParser < Test::Unit::TestCase + sub_test_case("#abbreviate") do + def abbreviate(xpath) + parser = REXML::Parsers::XPathParser.new + parser.abbreviate(xpath) + end + + def test_document + assert_equal("/", + abbreviate("/")) + end + + def test_descendant_or_self_only + assert_equal("//", + abbreviate("/descendant-or-self::node()/")) + end + + def test_descendant_or_self_absolute + assert_equal("//a/b", + abbreviate("/descendant-or-self::node()/a/b")) + end + + def test_descendant_or_self_relative + assert_equal("a//b", + abbreviate("a/descendant-or-self::node()/b")) + end + + def test_descendant_or_self_not_node + assert_equal("/descendant-or-self::text()", + abbreviate("/descendant-or-self::text()")) + end + + def test_self_absolute + assert_equal("/a/./b", + abbreviate("/a/self::node()/b")) + end + + def test_self_relative + assert_equal("a/./b", + abbreviate("a/self::node()/b")) + end + + def test_self_not_node + assert_equal("/self::text()", + abbreviate("/self::text()")) + end + + def test_parent_absolute + assert_equal("/a/../b", + abbreviate("/a/parent::node()/b")) + end + + def test_parent_relative + assert_equal("a/../b", + abbreviate("a/parent::node()/b")) + end + + def test_parent_not_node + 
assert_equal("/a/parent::text()", + abbreviate("/a/parent::text()")) + end + + def test_any_absolute + assert_equal("/*/a", + abbreviate("/*/a")) + end + + def test_any_relative + assert_equal("a/*/b", + abbreviate("a/*/b")) + end + + def test_following_sibling_absolute + assert_equal("/following-sibling::a/b", + abbreviate("/following-sibling::a/b")) + end + + def test_following_sibling_relative + assert_equal("a/following-sibling::b/c", + abbreviate("a/following-sibling::b/c")) + end + + def test_predicate_index + assert_equal("a[5]/b", + abbreviate("a[5]/b")) + end + + def test_attribute_relative + assert_equal("a/@b", + abbreviate("a/attribute::b")) + end + + def test_filter_attribute + assert_equal("a/b[@i = 1]/c", + abbreviate("a/b[attribute::i=1]/c")) + end + + def test_filter_string_single_quote + assert_equal("a/b[@name = \"single ' quote\"]/c", + abbreviate("a/b[attribute::name=\"single ' quote\"]/c")) + end + + def test_filter_string_double_quote + assert_equal("a/b[@name = 'double \" quote']/c", + abbreviate("a/b[attribute::name='double \" quote']/c")) + end + end + end +end diff --git a/test/test_attributes.rb b/test/test_attributes.rb index 91fc68a5..09fde442 100644 --- a/test/test_attributes.rb +++ b/test/test_attributes.rb @@ -178,18 +178,27 @@ def test_amp_and_lf_attributes attr_test('name','value with LF & ampersand') end - def test_quoting + def test_quote_root d = Document.new(%q{}) assert_equal( %q{}, d.to_s ) d.root.context[:attribute_quote] = :quote assert_equal( %q{}, d.to_s ) + end + def test_quote_sub_element d = Document.new(%q{}) assert_equal( %q{}, d.to_s ) d.root.context[:attribute_quote] = :quote assert_equal( %q{}, d.to_s ) end + def test_quote_to_s_value + doc = Document.new(%q{}, {attribute_quote: :quote}) + assert_equal(%q{}, doc.to_s) + assert_equal("'", doc.root.attribute("a").value) + assert_equal(%q{}, doc.to_s) + end + def test_ticket_127 doc = Document.new doc.add_element 'a', { 'v' => 'x & y' } diff --git 
a/test/test_contrib.rb b/test/test_contrib.rb index f3ad0b6c..23ee35b1 100644 --- a/test/test_contrib.rb +++ b/test/test_contrib.rb @@ -80,7 +80,7 @@ def test_bad_doctype_Tobias # Peter Verhage def test_namespace_Peter - source = <<-EOF + source = <<~EOF @@ -377,7 +377,7 @@ def test_various_xpath end def test_entities_Holden_Glova - document = <<-EOL + document = <<~EOL diff --git a/test/test_core.rb b/test/test_core.rb index fd3af8c2..e1fba8a7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -15,7 +15,7 @@ class Tester < Test::Unit::TestCase include Helper::Fixture include REXML def setup - @xsa_source = <<-EOL + @xsa_source = <<~EOL + + + XML + + expected_names = %w[ + root + 1_1 1_2 1_3 + 2_1 2_2 2_3 + ] + + document = REXML::Document.new(xml_source) + + # Node#each_recursive iterates elements only. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. + actual_names = [] + document.each_recursive do |element| + actual_names << element.attributes["name"] + end + assert_equal(expected_names, actual_names) + end + class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -212,7 +249,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -235,7 +272,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -275,7 +312,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! 
EOX @@ -295,7 +332,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX @@ -401,7 +438,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX diff --git a/test/test_encoding.rb b/test/test_encoding.rb index 09495c58..6887ffbe 100644 --- a/test/test_encoding.rb +++ b/test/test_encoding.rb @@ -67,7 +67,7 @@ def test_in_different_out # * Given an encoded document, accessing text and attribute nodes # should provide UTF-8 text. def test_in_different_access - doc = Document.new <<-EOL + doc = Document.new <<~EOL \xFF EOL @@ -79,7 +79,7 @@ def test_in_different_access def test_ticket_89 - doc = Document.new <<-EOL + doc = Document.new <<~EOL EOL diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..55205af8 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,63 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? 
+ event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) + end + + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) @@ -98,5 +155,101 @@ def test_peek end assert_equal( 0, names.length ) end + + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + while parser.has_next? + parser.pull + end + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + + REXML::Security.entity_expansion_limit = 100 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + assert_equal(101, parser.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + parser = REXML::Parsers::PullParser.new(source) + while parser.has_next? 
+ parser.pull + end + + REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + end + end + end end end diff --git a/test/test_sax.rb b/test/test_sax.rb index 6f775183..5e3ad75b 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) @@ -88,6 +99,92 @@ def test_sax2 end end + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + sax.parse + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + + REXML::Security.entity_expansion_limit = 100 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + assert_equal(101, sax.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + sax = REXML::Parsers::SAX2Parser.new(source) + sax.parse + + REXML::Security.entity_expansion_limit = 3 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + end + end + end + # used by test_simple_doctype_listener # submitted by Jeff Barczewski class SimpleDoctypeListener @@ -109,7 +206,7 @@ def doctype(name, pub_sys, long_name, uri) # test simple non-entity doctype in sax listener # submitted by Jeff Barczewski def test_simple_doctype_listener - xml = <<-END + xml = <<~END Hello, world! 
@@ -140,8 +237,8 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception - xml = <<-END + def test_doctype_with_missing_name_throws_exception + xml = <<~END Hello, world! diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..11cf65a3 --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check("&:;") + end + + def test_entity_name_start_char_under_score + assert_check("&_;") + end + + def test_entity_name_mix + assert_check("&A.b-0123;") + end + + def test_character_reference_decimal + assert_check("¢") + end + + def test_character_reference_hex + assert_check("􏿿") + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed("<;", "<") + end + + def test_lt_mix + assert_check_failed("ab @@ -24,7 +24,7 @@ def test_validate - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -33,7 +33,7 @@ def test_validate def test_sequence - rng = %q{ + rng = <<-XML @@ -45,7 +45,7 @@ def test_sequence - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -56,7 +56,7 @@ def 
test_sequence def test_choice - rng = %q{ + rng = <<-XML @@ -70,7 +70,7 @@ def test_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -79,7 +79,7 @@ def test_choice end def test_optional - rng = %q{ + rng = <<-XML @@ -90,7 +90,7 @@ def test_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -100,7 +100,7 @@ def test_optional end def test_zero_or_more - rng = %q{ + rng = <<-XML @@ -111,7 +111,7 @@ def test_zero_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) no_error( validator, %q{} ) @@ -119,7 +119,7 @@ def test_zero_or_more error( validator, %q{} ) error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -133,7 +133,7 @@ def test_zero_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -143,7 +143,7 @@ def test_zero_or_more end def test_one_or_more - rng = %q{ + rng = <<-XML @@ -154,7 +154,7 @@ def test_one_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -165,13 +165,13 @@ def test_one_or_more end def test_attribute - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -181,7 +181,7 @@ def test_attribute end def test_choice_attributes - rng = %q{ + rng = <<-XML @@ -189,7 +189,7 @@ def test_choice_attributes - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -199,7 +199,7 @@ def test_choice_attributes end def test_choice_attribute_element - rng = %q{ + rng = <<-XML @@ -207,7 +207,7 @@ def test_choice_attribute_element - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -217,12 +217,12 @@ def test_choice_attribute_element end def test_empty - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -231,12 +231,12 @@ def test_empty end def 
test_text_val - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -245,7 +245,7 @@ def test_text_val end def test_choice_text - rng = %q{ + rng = <<-XML @@ -253,7 +253,7 @@ def test_choice_text - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{Text} ) @@ -263,7 +263,7 @@ def test_choice_text end def test_group - rng = %q{ + rng = <<-XML @@ -274,7 +274,7 @@ def test_group - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -282,7 +282,7 @@ def test_group no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -291,7 +291,7 @@ def test_group - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -302,14 +302,14 @@ def test_group def test_value # Values as text nodes - rng = %q{ + rng = <<-XML VaLuE - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{X} ) @@ -317,7 +317,7 @@ def test_value no_error( validator, %q{VaLuE} ) # Values as text nodes, via choice - rng = %q{ + rng = <<-XML @@ -327,7 +327,7 @@ def test_value - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -336,14 +336,14 @@ def test_value no_error( validator, %q{Option 2} ) # Attribute values - rng = %q{ + rng = <<-XML VaLuE - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -352,7 +352,7 @@ def test_value no_error( validator, %q{} ) # Attribute values via choice - rng = %q{ + rng = <<-XML @@ -362,7 +362,7 @@ def test_value - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -372,7 +372,7 @@ def test_value end def test_interleave - rng = %q{ + rng = <<-XML @@ -383,7 +383,7 @@ def test_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -396,7 +396,7 @@ def test_interleave end def test_mixed - rng = %q{ + rng = <<-XML @@ -405,7 +405,7 
@@ def test_mixed - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{Text} ) @@ -413,7 +413,7 @@ def test_mixed end def test_ref_sequence - rng = %q{ + rng = <<-XML @@ -429,7 +429,7 @@ def test_ref_sequence - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -437,7 +437,7 @@ def test_ref_sequence end def test_ref_choice - rng = %q{ + rng = <<-XML @@ -453,7 +453,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -461,7 +461,7 @@ def test_ref_choice no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -477,7 +477,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -485,7 +485,7 @@ def test_ref_choice no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -502,7 +502,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -513,7 +513,7 @@ def test_ref_choice def test_ref_zero_plus - rng = %q{ + rng = <<-XML @@ -530,7 +530,7 @@ def test_ref_zero_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -538,7 +538,7 @@ def test_ref_zero_plus no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -555,7 +555,7 @@ def test_ref_zero_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -566,7 +566,7 @@ def test_ref_zero_plus def test_ref_one_plus - rng = %q{ + rng = <<-XML @@ -583,7 +583,7 @@ def test_ref_one_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -591,7 +591,7 @@ def test_ref_one_plus no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -608,7 +608,7 @@ def test_ref_one_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -618,7 
+618,7 @@ def test_ref_one_plus end def test_ref_interleave - rng = %q{ + rng = <<-XML @@ -634,7 +634,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -643,7 +643,7 @@ def test_ref_interleave no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -659,7 +659,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -668,7 +668,7 @@ def test_ref_interleave no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -687,7 +687,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -698,7 +698,7 @@ def test_ref_interleave end def test_ref_recurse - rng = %q{ + rng = <<-XML @@ -715,7 +715,7 @@ def test_ref_recurse - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -724,7 +724,7 @@ def test_ref_recurse end def test_ref_optional - rng = %q{ + rng = <<-XML @@ -740,7 +740,7 @@ def test_ref_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -748,7 +748,7 @@ def test_ref_optional error( validator, %q{} ) error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -764,7 +764,7 @@ def test_ref_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6db54bab..6a1f4df0 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -6,7 +6,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup - xml = <<-XML + xml = <<~XML diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 5156bbbe..1dacd69d 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -451,6 +451,46 @@ def test_following # puts results #end + def test_nested_predicates + doc = Document.new <<-EOF +

+
+ ab + cd +
+
+ ef + gh +
+
+ hi +
+
+ EOF + + matches = XPath.match(doc, '(/div/div/test[0])').map(&:text) + assert_equal [], matches + matches = XPath.match(doc, '(/div/div/test[1])').map(&:text) + assert_equal ["ab", "ef", "hi"], matches + matches = XPath.match(doc, '(/div/div/test[2])').map(&:text) + assert_equal ["cd", "gh"], matches + matches = XPath.match(doc, '(/div/div/test[3])').map(&:text) + assert_equal [], matches + + matches = XPath.match(doc, '(/div/div/test[1])[1]').map(&:text) + assert_equal ["ab"], matches + matches = XPath.match(doc, '(/div/div/test[1])[2]').map(&:text) + assert_equal ["ef"], matches + matches = XPath.match(doc, '(/div/div/test[1])[3]').map(&:text) + assert_equal ["hi"], matches + matches = XPath.match(doc, '(/div/div/test[2])[1]').map(&:text) + assert_equal ["cd"], matches + matches = XPath.match(doc, '(/div/div/test[2])[2]').map(&:text) + assert_equal ["gh"], matches + matches = XPath.match(doc, '(/div/div/test[2])[3]').map(&:text) + assert_equal [], matches + end + # Contributed by Mike Stok def test_starts_with source = <<-EOF @@ -611,7 +651,7 @@ def test_comparisons source = "" doc = REXML::Document.new(source) - # NOTE TO SER: check that number() is required + # NOTE: check that number() is required assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size diff --git a/test/xpath/test_predicate.rb b/test/xpath/test_predicate.rb index c8520712..278e3765 100644 --- a/test/xpath/test_predicate.rb +++ b/test/xpath/test_predicate.rb @@ -6,7 +6,7 @@ module REXMLTests class TestXPathPredicate < Test::Unit::TestCase include REXML - SRC=<<-EOL + SRC=<<~EOL
free flowing text.