Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 2fcc118

Browse files
author
eugenp
committed
scrape test
1 parent f9712c2 commit 2fcc118

File tree

2 files changed

+155
-0
lines changed

2 files changed

+155
-0
lines changed

spring-security-rest-custom/pom.xml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,30 @@
99

1010
<dependencies>
1111

12+
<dependency>
13+
<groupId>xml-apis</groupId>
14+
<artifactId>xml-apis</artifactId>
15+
<version>1.4.01</version>
16+
</dependency>
17+
18+
<dependency>
19+
<groupId>xalan</groupId>
20+
<artifactId>xalan</artifactId>
21+
<version>2.7.1</version>
22+
</dependency>
23+
24+
<dependency>
25+
<groupId>net.sourceforge.nekohtml</groupId>
26+
<artifactId>nekohtml</artifactId>
27+
<version>1.9.18</version>
28+
</dependency>
29+
30+
<dependency>
31+
<groupId>net.sourceforge.htmlcleaner</groupId>
32+
<artifactId>htmlcleaner</artifactId>
33+
<version>2.6</version>
34+
</dependency>
35+
1236
<!-- Spring Security -->
1337

1438
<dependency>
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package org.baeldung.live;
2+
3+
import java.io.IOException;
4+
import java.io.InputStream;
5+
import java.net.MalformedURLException;
6+
import java.net.URL;
7+
8+
import javax.xml.parsers.DocumentBuilder;
9+
import javax.xml.parsers.DocumentBuilderFactory;
10+
import javax.xml.parsers.ParserConfigurationException;
11+
import javax.xml.xpath.XPath;
12+
import javax.xml.xpath.XPathConstants;
13+
import javax.xml.xpath.XPathExpression;
14+
import javax.xml.xpath.XPathExpressionException;
15+
import javax.xml.xpath.XPathFactory;
16+
17+
import org.apache.http.HttpEntity;
18+
import org.apache.http.HttpResponse;
19+
import org.apache.http.client.ClientProtocolException;
20+
import org.apache.http.client.methods.HttpGet;
21+
import org.apache.http.impl.client.DefaultHttpClient;
22+
import org.apache.http.params.BasicHttpParams;
23+
import org.apache.http.params.HttpParams;
24+
import org.apache.http.util.EntityUtils;
25+
import org.cyberneko.html.parsers.DOMParser;
26+
import org.htmlcleaner.CleanerProperties;
27+
import org.htmlcleaner.HtmlCleaner;
28+
import org.htmlcleaner.TagNode;
29+
import org.htmlcleaner.XPatherException;
30+
import org.junit.Before;
31+
import org.junit.Test;
32+
import org.w3c.dom.Document;
33+
import org.xml.sax.SAXException;
34+
35+
public class CrawlTest {
36+
37+
private DefaultHttpClient client;
38+
39+
// fixtures
40+
41+
@Before
42+
public final void before() {
43+
final HttpParams httpParameters = new BasicHttpParams();
44+
client = new DefaultHttpClient(httpParameters);
45+
}
46+
47+
// tests
48+
49+
@Test
50+
public final void when_then() throws ClientProtocolException, IOException, XPathExpressionException, SAXException, ParserConfigurationException {
51+
final String url = "http://sales.starcitygames.com/category.php?cat=5260&amp;start=50";
52+
final String xpathEx = ".//*[@id='search_results_table']/tbody/tr/td[1]";
53+
54+
HttpGet request = null;
55+
HttpEntity httpEntity = null;
56+
InputStream entityContentStream = null;
57+
58+
try {
59+
request = new HttpGet(url);
60+
final HttpResponse httpResponse = client.execute(request);
61+
62+
httpEntity = httpResponse.getEntity();
63+
entityContentStream = httpEntity.getContent();
64+
65+
final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
66+
final DocumentBuilder builder = factory.newDocumentBuilder();
67+
final Document doc = builder.parse(entityContentStream);
68+
69+
final XPathFactory xPathfactory = XPathFactory.newInstance();
70+
final XPath xpath = xPathfactory.newXPath();
71+
final XPathExpression expr = xpath.compile(xpathEx);
72+
final Object evaluated = expr.evaluate(doc, XPathConstants.STRING);
73+
System.out.println(evaluated);
74+
} catch (final RuntimeException runEx) {
75+
runEx.printStackTrace();
76+
} finally {
77+
if (request != null) {
78+
request.releaseConnection();
79+
}
80+
if (entityContentStream != null) {
81+
entityContentStream.close();
82+
}
83+
if (httpEntity != null) {
84+
EntityUtils.consume(httpEntity);
85+
}
86+
}
87+
}
88+
89+
// http://htmlcleaner.sourceforge.net/parameters.php
90+
@SuppressWarnings("unused")
91+
@Test
92+
public final void givenCleaningWithHtmlCleaner_whenPageIsRetrieved_thenContentCanBeExtracted() throws XPatherException, MalformedURLException, IOException {
93+
final String url = "http://sales.starcitygames.com/category.php?cat=5260&amp;start=50";
94+
final String xpathEx1 = ".//*[@id='search_results_table']/tbody/tr/td[1]/b/a/text()";
95+
final String xpathEx2 = ".//*[@id='search_results_table']/tbody/tr";
96+
97+
final CleanerProperties props = new CleanerProperties();
98+
// set some properties to non-default values
99+
props.setAdvancedXmlEscape(true);
100+
// props.setOmitComments(true);
101+
102+
// do parsing
103+
final TagNode tagNode = new HtmlCleaner(props).clean(new URL(url));
104+
final Object[] evaluateXPath = tagNode.evaluateXPath(xpathEx2);
105+
final Object ex = evaluateXPath[7];
106+
107+
System.out.println(ex);
108+
// System.out.println(Arrays.toString(evaluateXPath));
109+
// new PrettyXmlSerializer(props).writeToStream(tagNode, System.out);
110+
}
111+
112+
@SuppressWarnings("unused")
113+
@Test
114+
public final void givenCleaningWithNeko_whenPageIsRetrieved_thenContentCanBeExtracted() throws XPatherException, MalformedURLException, IOException, XPathExpressionException, SAXException {
115+
final String url = "http://sales.starcitygames.com/category.php?cat=5260&amp;start=50";
116+
final String xpathEx1 = ".//*[@id='search_results_table']/tbody/tr/td[1]/b/a/text()";
117+
final String xpathEx2 = ".//*[@id='search_results_table']/tbody/tr";
118+
119+
final DOMParser parser = new DOMParser();
120+
parser.setFeature("http://xml.org/sax/features/namespaces", false);
121+
parser.parse(url);
122+
final Document document = parser.getDocument();
123+
124+
final XPathFactory xpf = XPathFactory.newInstance();
125+
final XPath xpath = xpf.newXPath();
126+
final Object evaluate = xpath.evaluate(xpathEx2, document);
127+
128+
System.out.println(evaluate);
129+
}
130+
131+
}

0 commit comments

Comments
 (0)