The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as usual. For any urgent requests from students or faculty members, please submit the form linked here.
[Solved] XPath function in RapidMiner: info from multiple grandparents
Kate_Strydom
Member Posts: 19 Contributor II
Hi,
I am in the process of learning to use the web crawler operator and then turning the extracted html/txt file into a database. Currently, I am struggling to obtain more than one grandparent node in my RapidMiner output. For example, I want to extract all the shop names under the
//h:div/h:ul/h:li[position()]/h:a/text(). The li node is the one that changes, and the number of li nodes varies per document (that is, there are n of them). I just can't work out how to get all the grandparents from my file. Only the first one appears, unless I change the predicate to position()=3, in which case I get only the third one.
I would appreciate it if someone could share their knowledge with me on how to achieve my objective.
Kate
I am in the process of learning to use the web crawler operator and then turning the extracted html/txt file into a database. Currently, I am struggling to obtain more than one grandparent node in my RapidMiner output. For example, I want to extract all the shop names under the
//h:div/h:ul/h:li[position()]/h:a/text(). The li node is the one that changes, and the number of li nodes varies per document (that is, there are n of them). I just can't work out how to get all the grandparents from my file. Only the first one appears, unless I change the predicate to position()=3, in which case I get only the third one.
I would appreciate it if someone could share their knowledge with me on how to achieve my objective.
Many thanks.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.0 process definition.
  Flow, as wired by the <connect> elements below:
    Get Page, then Cut Document, then Documents to Data, then result 1.
  Inside Cut Document every segment goes through Remove Document Parts and
  Multiply, and the two copies are fed to Cut Document (2) and Cut Document (3).
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.11" expanded="true" name="Process">
    <process expanded="true" height="415" width="685">
      <!-- Fetches the page at the given URL. The "&" separating the query
           arguments must be written as an entity reference inside an XML
           attribute value; a raw "&" makes the file not well-formed. -->
      <operator activated="true" class="web:get_webpage" compatibility="5.0.4" expanded="true" height="60" name="Get Page" width="90" x="45" y="255">
        <parameter key="url" value="http://forum.spiegel.de/showthread.php?t=22981&amp;page=6"/>
        <list key="query_parameters"/>
      </operator>
      <!-- Splits the fetched page into segments using the "Segmenter" XPath
           query; each segment is processed by the nested subprocess. -->
      <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document" width="90" x="313" y="120">
        <parameter key="query_type" value="XPath"/>
        <!-- NOTE(review): the key spelling "string_machting_queries" is kept
             verbatim; presumably it matches the operator's own parameter
             name. Verify before renaming. -->
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="Segmenter" value="/h:html/h:body/h:div[4]/h:div[1]/h:div[2]/h:div[2]/h:div[2]/h:div/h:div/h:div/h:div/h:table"/>
        </list>
        <!-- NOTE(review): the queries use the "h:" prefix, but only "xx" is
             mapped here. Presumably "h" is predefined by the operator;
             confirm. -->
        <list key="namespaces">
          <parameter key="xx" value="xml"/>
        </list>
        <parameter key="ignore_CDATA" value="false"/>
        <list key="index_queries"/>
        <process expanded="true" height="499" width="750">
          <!-- Deletes every match of the regex from the segment. The regex is
               a literal br tag, so "<" and the inner double quotes are
               escaped; written raw they terminate the attribute value early
               and break the file. The parsed value is unchanged. -->
          <operator activated="true" class="text:remove_document_parts" compatibility="5.0.7" expanded="true" height="60" name="Remove Document Parts" width="90" x="112" y="75">
            <parameter key="deletion_regex" value="(&lt;br clear=&quot;none&quot; /&gt;)"/>
          </operator>
          <!-- Duplicates the cleaned segment so that both extraction
               operators below receive it. -->
          <operator activated="true" class="multiply" compatibility="5.0.11" expanded="true" height="94" name="Multiply" width="90" x="279" y="97"/>
          <!-- "Zitate" query: text nodes of every h:div whose style attribute
               equals 'font-style:italic'. -->
          <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (2)" width="90" x="447" y="120">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Zitate" value="//h:div[@style='font-style:italic']/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="false"/>
            <list key="index_queries"/>
            <process expanded="true" height="499" width="750">
              <connect from_port="segment" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- "Posting" query: a union of two paths, one with and one without
               an h:tbody step.
               NOTE(review): the first alternative starts with "//h:table",
               the second with "/h:table"; confirm the difference is
               intended. -->
          <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (3)" width="90" x="447" y="255">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Posting" value="//h:table/h:tr[2]/h:td[2]/h:div[2]/text()[2]|/h:table/h:tbody/h:tr[2]/h:td[2]/h:div[2]/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="false"/>
            <list key="index_queries"/>
            <process expanded="true" height="499" width="750">
              <connect from_port="segment" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the per-segment subprocess. -->
          <connect from_port="segment" to_op="Remove Document Parts" to_port="document"/>
          <connect from_op="Remove Document Parts" from_port="document" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Cut Document (3)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
          <connect from_op="Cut Document (3)" from_port="documents" to_port="document 2"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
          <portSpacing port="sink_document 3" spacing="0"/>
        </process>
      </operator>
      <!-- Converts the documents produced by Cut Document into an example
           set, using the attribute names configured here. -->
      <operator activated="true" class="text:documents_to_data" compatibility="5.0.7" expanded="true" height="76" name="Documents to Data" width="90" x="581" y="120">
        <parameter key="text_attribute" value="Testattr"/>
        <parameter key="label_attribute" value="testattribut"/>
      </operator>
      <!-- Top-level wiring: page, then segments, then example set, then the
           first process result. -->
      <connect from_op="Get Page" from_port="output" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Kate
0
Answers