The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
Xpath and Strange Behaviour
Dear all,
I have been trying to use Extract Information with XPath in RapidMiner in order to extract the title and the content of an article.
Although Google Chrome provides an easy-to-use tool for obtaining the XPath, RapidMiner is extremely difficult to adjust when it comes to XPath.
In the process below, I had to include two non-obvious namespace IDs in order to extract what I needed. Could someone please explain why it is so difficult to adopt standard XPath approaches that run without problems in Google Spreadsheets or in the Java and C# XML APIs?
Thank you in advance
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.3.015).
  Crawls pages starting at the configured URL with "Process Documents from Web"
  and, for every stored page, runs "Extract Information" with two XPath queries
  ("title" and "text").
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Crawler: the same URL pattern is used both for following links and for storing pages. -->
      <operator activated="true" class="web:process_web" compatibility="5.3.001" expanded="true" height="60" name="Process Documents from Web" width="90" x="45" y="300">
        <parameter key="url" value="http://www.enikos.gr/sports/"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".+sports/.[0-9].+"/>
          <parameter key="follow_link_with_matching_url" value=".+sports/.[0-9].+"/>
        </list>
        <parameter key="max_pages" value="20"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="domain" value="server"/>
        <parameter key="delay" value="2000"/>
        <parameter key="max_threads" value="4"/>
        <parameter key="max_page_size" value="1000"/>
        <process expanded="true">
          <!-- Inner process: applied to each crawled document. -->
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="179" y="165">
            <parameter key="query_type" value="XPath"/>
            <!-- NOTE(review): key spelling "string_machting_queries" is kept exactly as exported; do not correct it here. -->
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!--
              Both queries address elements through the "h:" prefix.
              NOTE(review): that prefix is not bound in the "namespaces" list below;
              presumably the operator binds it to the XHTML namespace itself. Confirm.
              "text" concatenates the text of the first six paragraphs of the matching div.
            -->
            <list key="xpath_queries">
              <parameter key="title" value="//h:div[contains(@class,'title')]/h:h2/text()"/>
              <parameter key="text" value="concat( //h:div[contains(@class,'text')]/h:p[1]/text(), //h:div[contains(@class,'text')]/h:p[2]/text(), //h:div[contains(@class,'text')]/h:p[3]/text(), //h:div[contains(@class,'text')]/h:p[4]/text(), //h:div[contains(@class,'text')]/h:p[5]/text(), //h:div[contains(@class,'text')]/h:p[6]/text() )"/>
            </list>
            <!--
              The values below are markup snippets stored as attribute text, so the
              characters <, > and " must be written as entities; raw, they make the
              whole document ill-formed. The trailing space in each value is preserved.
            -->
            <list key="namespaces">
              <parameter key="title" value="&lt;h2 xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;&lt;/h2&gt; "/>
              <parameter key="text" value="&lt;p xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;&lt;/p&gt; "/>
            </list>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Web" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
I have been trying to use Extract Information with XPath in RapidMiner in order to extract the title and the content of an article.
Although Google Chrome provides an easy-to-use tool for obtaining the XPath, RapidMiner is extremely difficult to adjust when it comes to XPath.
In the process below, I had to include two non-obvious namespace IDs in order to extract what I needed. Could someone please explain why it is so difficult to adopt standard XPath approaches that run without problems in Google Spreadsheets or in the Java and C# XML APIs?
Thank you in advance
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition, version 5.3.015.
  Outer operator: "Process Documents from Web" (web crawler).
  Inner operator: "Extract Information" in XPath mode, producing the
  "title" and "text" results for each crawled document.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Web crawl configuration; store and follow rules share one URL regex. -->
      <operator activated="true" class="web:process_web" compatibility="5.3.001" expanded="true" height="60" name="Process Documents from Web" width="90" x="45" y="300">
        <parameter key="url" value="http://www.enikos.gr/sports/"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".+sports/.[0-9].+"/>
          <parameter key="follow_link_with_matching_url" value=".+sports/.[0-9].+"/>
        </list>
        <parameter key="max_pages" value="20"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="domain" value="server"/>
        <parameter key="delay" value="2000"/>
        <parameter key="max_threads" value="4"/>
        <parameter key="max_page_size" value="1000"/>
        <process expanded="true">
          <!-- Per-document extraction step. -->
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="179" y="165">
            <parameter key="query_type" value="XPath"/>
            <!-- NOTE(review): the key "string_machting_queries" is reproduced verbatim from the export. -->
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!--
              XPath queries use the "h:" prefix on every element step.
              NOTE(review): no entry in the "namespaces" list binds "h"; it is
              presumably supplied by the operator for XHTML. Verify before editing.
            -->
            <list key="xpath_queries">
              <parameter key="title" value="//h:div[contains(@class,'title')]/h:h2/text()"/>
              <parameter key="text" value="concat( //h:div[contains(@class,'text')]/h:p[1]/text(), //h:div[contains(@class,'text')]/h:p[2]/text(), //h:div[contains(@class,'text')]/h:p[3]/text(), //h:div[contains(@class,'text')]/h:p[4]/text(), //h:div[contains(@class,'text')]/h:p[5]/text(), //h:div[contains(@class,'text')]/h:p[6]/text() )"/>
            </list>
            <!--
              Attribute values that contain markup must escape <, > and " as
              entities, otherwise the file cannot be parsed. The parsed value,
              including its trailing space, is the same as the literal markup.
            -->
            <list key="namespaces">
              <parameter key="title" value="&lt;h2 xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;&lt;/h2&gt; "/>
              <parameter key="text" value="&lt;p xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;&lt;/p&gt; "/>
            </list>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Web" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
0