Skip to content

Commit

Permalink
Disable namespaces in Element#selectXpath
Browse files Browse the repository at this point in the history
Simplifies use of xpath queries when there are xmlns attributes set, by allowing elements to be found by their local name, consistently.

Fixes #1801
  • Loading branch information
jhy committed Jul 3, 2022
1 parent b873e21 commit 2b573de
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 10 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ jsoup changelog
so that they are indented correctly.
<https://github.com/jhy/jsoup/issues/1798>

* Improvement: in Element#selectXpath(), disable namespace awareness. This makes it possible to always select elements
by their simple local name, regardless of whether an xmlns attribute was set.
<https://github.com/jhy/jsoup/issues/1801>

* Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not
already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default
internal buffer size is read.
Expand Down
27 changes: 23 additions & 4 deletions src/main/java/org/jsoup/helper/W3CDom.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,40 @@ public class W3CDom {
private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context


/**
To get support for XPath versions &gt; 1, set this property to the class name of an alternate XPathFactory
implementation. (For example, {@code net.sf.saxon.xpath.XPathFactoryImpl}).
*/
public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";

protected DocumentBuilderFactory factory;
private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience

/**
 Creates a W3CDom backed by a new DocumentBuilderFactory, with namespace awareness enabled on that factory.
 */
public W3CDom() {
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    builderFactory.setNamespaceAware(true); // same value the namespaceAware field is initialised with
    factory = builderFactory;
}

/**
 Reports whether this W3C DOM is namespace aware. The setting is {@code true} by default; it is switched off, for
 simplicity, when XPath selectors are run via {@link org.jsoup.nodes.Element#selectXpath(String)}.
 @return {@code true} if namespace awareness is currently enabled, otherwise {@code false}.
 */
public boolean namespaceAware() {
    return this.namespaceAware;
}

/**
 Sets whether this W3CDom is namespace aware. The value is recorded on this instance and is also applied to the
 factory that is used to create W3C nodes from jsoup nodes.
 @param namespaceAware the updated setting
 @return this W3CDom, for chaining.
 */
public W3CDom namespaceAware(boolean namespaceAware) {
    this.namespaceAware = namespaceAware;
    // keep the document builder factory in step with the recorded setting
    this.factory.setNamespaceAware(this.namespaceAware);
    return this;
}

/**
* Converts a jsoup DOM to a W3C DOM.
*
Expand All @@ -92,7 +112,6 @@ public static Document convert(org.jsoup.nodes.Document in) {
* @see OutputKeys#STANDALONE
* @see OutputKeys#STANDALONE
* @see OutputKeys#DOCTYPE_PUBLIC
* @see OutputKeys#DOCTYPE_PUBLIC
* @see OutputKeys#CDATA_SECTION_ELEMENTS
* @see OutputKeys#INDENT
* @see OutputKeys#MEDIA_TYPE
Expand Down Expand Up @@ -314,7 +333,7 @@ public String asString(Document doc) {
/**
* Implements the conversion by walking the input.
*/
protected static class W3CBuilder implements NodeVisitor {
protected class W3CBuilder implements NodeVisitor {
private static final String xmlnsKey = "xmlns";
private static final String xmlnsPrefix = "xmlns:";

Expand All @@ -337,7 +356,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

String prefix = updateNamespaces(sourceEl);
String namespace = namespacesStack.peek().get(prefix);
String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null;
String tagName = sourceEl.tagName();

/* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation,
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,8 @@ public boolean is(Evaluator evaluator) {

/**
Find Elements that match the supplied XPath expression.
<p>Note that for convenience of writing the XPath expression, namespaces are disabled, and queries can be
expressed using the element's local name only.</p>
<p>By default, XPath 1.0 expressions are supported. If you would like to use XPath 2.0 or higher, you can provide an
alternate XPathFactory implementation:</p>
<ol>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/nodes/NodeUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ static <T extends Node> List<T> selectXpath(String xpath, Element el, Class<T> n
Validate.notNull(el);
Validate.notNull(nodeType);

W3CDom w3c = new W3CDom();
W3CDom w3c = new W3CDom().namespaceAware(false);
org.w3c.dom.Document wDoc = w3c.fromJsoup(el);
org.w3c.dom.Node contextNode = w3c.contextNode(wDoc);
NodeList nodeList = w3c.selectXpath(xpath, contextNode);
Expand Down
14 changes: 14 additions & 0 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,20 @@ public void xmlnsXpathTest() throws XPathExpressionException {
assertNull(nodeList);
}

@Test
void canDisableNamespaces() throws XPathExpressionException {
    // namespace awareness starts enabled; the chaining setter switches it off
    W3CDom w3c = new W3CDom();
    assertTrue(w3c.namespaceAware());
    assertFalse(w3c.namespaceAware(false).namespaceAware());

    String html = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
    Document dom = w3c.fromJsoup(Jsoup.parse(html));

    // with namespaces disabled, the unprefixed element name matches despite the xmlns declaration
    NodeList matched = xpath(dom, "//body");
    // NOTE(review): the xpath helper evaluates as a single NODE and casts it to NodeList, so item(0) appears to be
    // the first child of the matched body rather than the body itself - confirm this is the intent.
    String firstLocalName = matched.item(0).getLocalName();
    assertEquals("div", firstLocalName);
}

private NodeList xpath(Document w3cDoc, String query) throws XPathExpressionException {
XPathExpression xpath = XPathFactory.newInstance().newXPath().compile(query);
return ((NodeList) xpath.evaluate(w3cDoc, XPathConstants.NODE));
Expand Down
45 changes: 40 additions & 5 deletions src/test/java/org/jsoup/select/XpathTest.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.jsoup.select;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
Expand All @@ -17,7 +16,6 @@
import javax.xml.xpath.XPathFactoryConfigurationException;
import javax.xml.xpath.XPathFunctionResolver;
import javax.xml.xpath.XPathVariableResolver;

import java.util.List;
import java.util.stream.Stream;

Expand Down Expand Up @@ -76,8 +74,8 @@ public void throwsSelectException() {
}

@Test
public void supportsNamespaces() {
String xhtml = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";;
public void supportsLocalname() {
String xhtml = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
Document doc = Jsoup.parse(xhtml, Parser.xmlParser());
Elements elements = doc.selectXpath("//*[local-name()='body']");
assertEquals(1, elements.size());
Expand All @@ -86,7 +84,7 @@ public void supportsNamespaces() {

@Test
public void canDitchNamespaces() {
String xhtml = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";;
String xhtml = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
Document doc = Jsoup.parse(xhtml, Parser.xmlParser());
doc.select("[xmlns]").removeAttr("xmlns");
Elements elements = doc.selectXpath("//*[local-name()='body']");
Expand Down Expand Up @@ -192,8 +190,45 @@ public void canSupplyAlternateFactoryImpl() {
}
assertTrue(threw);
System.clearProperty(XPathFactoryProperty);
}

@Test
public void notNamespaceAware() {
    // selectXpath runs with namespaces disabled, so the body under an xmlns'd root is found by its plain name
    String xhtml = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
    Document doc = Jsoup.parse(xhtml, Parser.xmlParser());

    Elements bodies = doc.selectXpath("//body");
    assertEquals(1, bodies.size());

    Element body = bodies.first();
    assertEquals("One", body.id());
}

@Test
public void supportsPrefixes() {
    // example from https://www.w3.org/TR/xml-names/
    String xml = "<?xml version=\"1.0\"?>\n" +
        "<bk:book xmlns:bk='urn:loc.gov:books'\n" +
        " xmlns:isbn='urn:ISBN:0-395-36341-6'>\n" +
        " <bk:title>Cheaper by the Dozen</bk:title>\n" +
        " <isbn:number>1568491379</isbn:number>\n" +
        "</bk:book>";
    Document doc = Jsoup.parse(xml, Parser.xmlParser());
    String expectedTitle = "Cheaper by the Dozen";

    // namespaces are disabled in selectXpath, so the plain path is used here instead of //bk:book/bk:title
    Elements byPlainPath = doc.selectXpath("//book/title");
    assertEquals(1, byPlainPath.size());
    assertEquals(expectedTitle, byPlainPath.first().text());

    // the full prefixed name is still available through name()
    Elements byPrefixedName = doc.selectXpath("//*[name()='bk:book']/*[name()='bk:title']");
    assertEquals(1, byPrefixedName.size());
    assertEquals(expectedTitle, byPrefixedName.first().text());

    // and the unprefixed name through local-name()
    Elements byLocalNameFn = doc.selectXpath("//*[local-name()='book']/*[local-name()='title']");
    assertEquals(1, byLocalNameFn.size());
    assertEquals(expectedTitle, byLocalNameFn.first().text());

    // an element declared with a different prefix (isbn) is also reachable by its plain name
    Elements isbnNumbers = doc.selectXpath("//book/number");
    assertEquals(1, isbnNumbers.size());
    assertEquals("1568491379", isbnNumbers.first().text());
}

// minimal, no-op implementation class to verify users can load a factory to support XPath 2.0 etc
Expand Down

0 comments on commit 2b573de

Please sign in to comment.