The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
XPath multiple record extraction
Hello everyone,
I am trying to use "Extract Information" operator with XPath query type in order to duplicate
the video example: "Web Scraping with RapidMiner and XPath" from: http://vancouverdata.blogspot.com/2011/04/web-scraping-rapidminer-xpath-web.html
The technique works well when I try to extract one record with one or more attributes per page.
But if I try to extract, for example, all job links (http://vancouver.en.craigslist.ca/jjj/)
using the query: //h:blockquote/h:p[not(@align='center')]/h:a/@href
as was described on the tutorial (and is working on google spreadsheets)
"Extract Information" is returning just one random record instead of multiple records of all available job links from the page
I tried to extend the example a little bit by using the "Crawl Web" operator in order to get the job links from more than one page,
process them through a macro approach with the 'Loop Examples' operator, and finally aggregate the results into one final record set by using the "Append" operator,
but the system is failing for unexplained reasons.
here is the XML code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.1 process: crawl Craigslist index pages, loop over each stored
     page, re-fetch it, and extract job links with an XPath query. -->
<process version="5.1.011">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
<process expanded="true" height="548" width="883">
<!-- Crawl at most 3 pages (depth 3) whose URL matches .*index.*; pages are kept
     as attributes of the resulting example set rather than written to files. -->
<operator activated="true" class="web:crawl_web" compatibility="5.1.004" expanded="true" height="60" name="Crawl Web (2)" width="90" x="45" y="165">
<parameter key="url" value="http://vancouver.en.craigslist.ca/jjj/"/>
<list key="crawling_rules">
<parameter key="follow_link_with_matching_url" value=".*index.*"/>
<parameter key="store_with_matching_url" value=".*index.*"/>
</list>
<parameter key="write_pages_into_files" value="false"/>
<parameter key="add_pages_as_attribute" value="true"/>
<parameter key="output_dir" value="C:\Users\Administrator\Documents\traRM"/>
<parameter key="max_pages" value="3"/>
<parameter key="max_depth" value="3"/>
<parameter key="obey_robot_exclusion" value="false"/>
<parameter key="really_ignore_exclusion" value="true"/>
</operator>
<!-- Add a sequential id attribute so rows can be addressed by index inside the loop. -->
<operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="246" y="165"/>
<!-- Iterate once per example; the current iteration number is exposed as the %{id} macro. -->
<operator activated="true" class="loop_examples" compatibility="5.1.011" expanded="true" height="94" name="Loop Examples (2)" width="90" x="447" y="165">
<parameter key="iteration_macro" value="id"/>
<process expanded="true" height="548" width="901">
<!-- Copy the Link value of the row selected by %{id} into the website_url macro. -->
<operator activated="true" class="extract_macro" compatibility="5.1.011" expanded="true" height="60" name="Extract Macro (3)" width="90" x="246" y="30">
<parameter key="macro" value="website_url"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="Link"/>
<parameter key="example_index" value="%{id}"/>
</operator>
<!-- Fetch the page addressed by the macro; note this operator has no input
     connection, it is driven solely by %{website_url}. -->
<operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="210">
<parameter key="url" value="%{website_url}"/>
<list key="query_parameters"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="5.1.002" expanded="true" height="94" name="Process Documents (2)" width="90" x="313" y="165">
<parameter key="create_word_vector" value="false"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true" height="548" width="901">
<!-- NOTE(review): per the thread, Extract Information yields only a single
     match per document here, hence one link instead of all of them; splitting
     the document first with Cut Document produces one sub-document per match.
     The query uses the h: prefix while the namespaces list is empty -
     presumably h: is the built-in XHTML binding; verify in RapidMiner. -->
<operator activated="true" class="text:extract_information" compatibility="5.1.002" expanded="true" height="60" name="Extract Information (4)" width="90" x="380" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="xpath" value="//h:blockquote/h:p[not(@align='center')]/h:a/@href"/>
</list>
<list key="namespaces"/>
<list key="index_queries"/>
</operator>
<connect from_port="document" to_op="Extract Information (4)" to_port="document"/>
<connect from_op="Extract Information (4)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
<connect from_op="Extract Macro (3)" from_port="example set" to_port="example set"/>
<connect from_op="Get Page" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
<connect from_op="Process Documents (2)" from_port="example set" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Runs with a breakpoint set before it; only one loop output is wired in, so
     Append receives a single example set per run. -->
<operator activated="true" breakpoints="before" class="append" compatibility="5.1.011" expanded="true" height="76" name="Append (2)" width="90" x="648" y="165"/>
<connect from_op="Crawl Web (2)" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
<connect from_op="Loop Examples (2)" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
<connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Could you help me solve this problem, please?
Thanks
I am trying to use "Extract Information" operator with XPath query type in order to duplicate
the video example: "Web Scraping with RapidMiner and XPath" from: http://vancouverdata.blogspot.com/2011/04/web-scraping-rapidminer-xpath-web.html
The technique works well when I try to extract one record with one or more attributes per page.
But if I try to extract, for example, all job links (http://vancouver.en.craigslist.ca/jjj/)
using the query: //h:blockquote/h:p[not(@align='center')]/h:a/@href
as was described on the tutorial (and is working on google spreadsheets)
"Extract Information" is returning just one random record instead of multiple records of all available job links from the page
I tried to extend the example a little bit by using the "Crawl Web" operator in order to get the job links from more than one page,
process them through a macro approach with the 'Loop Examples' operator, and finally aggregate the results into one final record set by using the "Append" operator,
but the system is failing for unexplained reasons.
here is the XML code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.1 process: crawl Craigslist index pages, loop over each stored
     page, re-fetch it, and extract job links with an XPath query. -->
<process version="5.1.011">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
<process expanded="true" height="548" width="883">
<!-- Crawl at most 3 pages (depth 3) whose URL matches .*index.*; pages are kept
     as attributes of the resulting example set rather than written to files. -->
<operator activated="true" class="web:crawl_web" compatibility="5.1.004" expanded="true" height="60" name="Crawl Web (2)" width="90" x="45" y="165">
<parameter key="url" value="http://vancouver.en.craigslist.ca/jjj/"/>
<list key="crawling_rules">
<parameter key="follow_link_with_matching_url" value=".*index.*"/>
<parameter key="store_with_matching_url" value=".*index.*"/>
</list>
<parameter key="write_pages_into_files" value="false"/>
<parameter key="add_pages_as_attribute" value="true"/>
<parameter key="output_dir" value="C:\Users\Administrator\Documents\traRM"/>
<parameter key="max_pages" value="3"/>
<parameter key="max_depth" value="3"/>
<parameter key="obey_robot_exclusion" value="false"/>
<parameter key="really_ignore_exclusion" value="true"/>
</operator>
<!-- Add a sequential id attribute so rows can be addressed by index inside the loop. -->
<operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="246" y="165"/>
<!-- Iterate once per example; the current iteration number is exposed as the %{id} macro. -->
<operator activated="true" class="loop_examples" compatibility="5.1.011" expanded="true" height="94" name="Loop Examples (2)" width="90" x="447" y="165">
<parameter key="iteration_macro" value="id"/>
<process expanded="true" height="548" width="901">
<!-- Copy the Link value of the row selected by %{id} into the website_url macro. -->
<operator activated="true" class="extract_macro" compatibility="5.1.011" expanded="true" height="60" name="Extract Macro (3)" width="90" x="246" y="30">
<parameter key="macro" value="website_url"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="Link"/>
<parameter key="example_index" value="%{id}"/>
</operator>
<!-- Fetch the page addressed by the macro; note this operator has no input
     connection, it is driven solely by %{website_url}. -->
<operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="210">
<parameter key="url" value="%{website_url}"/>
<list key="query_parameters"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="5.1.002" expanded="true" height="94" name="Process Documents (2)" width="90" x="313" y="165">
<parameter key="create_word_vector" value="false"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true" height="548" width="901">
<!-- NOTE(review): per the thread, Extract Information yields only a single
     match per document here, hence one link instead of all of them; splitting
     the document first with Cut Document produces one sub-document per match.
     The query uses the h: prefix while the namespaces list is empty -
     presumably h: is the built-in XHTML binding; verify in RapidMiner. -->
<operator activated="true" class="text:extract_information" compatibility="5.1.002" expanded="true" height="60" name="Extract Information (4)" width="90" x="380" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="xpath" value="//h:blockquote/h:p[not(@align='center')]/h:a/@href"/>
</list>
<list key="namespaces"/>
<list key="index_queries"/>
</operator>
<connect from_port="document" to_op="Extract Information (4)" to_port="document"/>
<connect from_op="Extract Information (4)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
<connect from_op="Extract Macro (3)" from_port="example set" to_port="example set"/>
<connect from_op="Get Page" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
<connect from_op="Process Documents (2)" from_port="example set" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Runs with a breakpoint set before it; only one loop output is wired in, so
     Append receives a single example set per run. -->
<operator activated="true" breakpoints="before" class="append" compatibility="5.1.011" expanded="true" height="76" name="Append (2)" width="90" x="648" y="165"/>
<connect from_op="Crawl Web (2)" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
<connect from_op="Loop Examples (2)" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
<connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Could you help me solve this problem, please?
Thanks
0
Answers
You'll have to use Cut Document to be able to extract multiple similar items from one document. Do a search on this forum and you'll find some examples. I started building my process based on this approach and it works pretty well.
Good luck.
Thank you very much for the advice to use "Cut Document" — it is working excellently for me.
For anybody experiencing the same problem, here is the solution: