The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
XPath empty results
Hello. I'm trying to mine data using XPath from Google Scholar pages.
I'm trying to get the name, h-index, and the first 20 publications.
I am using the following queries
substring-before(//title, " - Google Scholar Citations")
//*[contains(.,"h-index")]/../tr[3]//td[2]
//a[contains(@href,'citation_for_view')]
All of them work in Google Docs and in Java, but none of them works in RapidMiner.
I can't figure out what's wrong...
I'm trying to get the name, h-index, and the first 20 publications.
I am using the following queries
substring-before(//title, " - Google Scholar Citations")
//*[contains(.,"h-index")]/../tr[3]//td[2]
//a[contains(@href,'citation_for_view')]
All of them work in Google Docs and in Java, but none of them works in RapidMiner.
I can't figure out what's wrong...
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process: crawl Google Scholar profile pages, then extract
     name / h-index / publication links from the saved HTML files via XPath. -->
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <!-- Crawler is currently deactivated (activated="false"); the files in
           text_directories below are read instead. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.3.001" expanded="true" height="60" name="Crawl Web" width="90" x="112" y="30">
        <!-- FIX: raw '&' in an attribute value is not well-formed XML; every
             query-string separator is escaped as &amp; (the parser delivers
             the literal '&' to the operator). -->
        <parameter key="url" value="http://scholar.google.gr/citations?view_op=search_authors&amp;hl=el&amp;mauthors=label:web_mining"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+user=.+"/>
          <!-- FIX: same escaping issue in the regex below ('8J&astart'). -->
          <parameter key="follow_link_with_matching_url" value=".+8J&amp;astart=.+"/>
        </list>
        <parameter key="output_dir" value="/tmp"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="5000"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="max_threads" value="2"/>
        <parameter key="max_page_size" value="300"/>
        <parameter key="user_agent" value=" Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0"/>
      </operator>
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="112" y="165">
        <list key="text_directories">
          <parameter key="all" value="/home/phoenix/DataMine/SkolarCrawl"/>
        </list>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="html"/>
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <!-- NOTE: "string_machting_queries" is RapidMiner's own (misspelled)
                 parameter key — do not "correct" it; the operator looks it up
                 by this exact string. -->
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <!-- FIX: raw '"' inside a double-quoted attribute value is not
                   well-formed XML; inner quotes are escaped as &quot;. -->
              <parameter key="Name" value="substring-before(//title, &quot; - Google Scholar Citations&quot;)"/>
              <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../tr[3]//td[2]"/>
              <parameter key="Publications" value="//a[contains(@href,'citation_for_view')]"/>
              <!-- NOTE(review): RapidMiner converts HTML to XHTML and binds it
                   to a namespace, so unprefixed XPath steps select nothing.
                   Queries likely need the h: prefix (e.g. //h:title,
                   //h:a[contains(@href,'citation_for_view')]) — confirm against
                   the Extract Information operator documentation. This would
                   explain queries that work in Google Docs/Java but return
                   empty results here. -->
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
0
Answers
with "Extract Information". This approach seems to be better in your case. Please check and take into account the use of nested processes.