[SOLVED] Web Crawling - Process document from web returns only first record

Boggsy · December 2012

Hi, I'm hoping someone can help me out with a web crawling question. I'm using the Process Documents from Web and Extract Information operators, and the XPath queries are returning the information that I need, which I can export to CSV. The issue is that only the first record is output, but I need all forum data from the 10 URLs to be output. I've seen a few posts where people are using loop examples and the Cut Document operator, but I can't seem to get this working right. Has anyone come up with a novel approach to doing this?
Thanks

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.2.008) from the original question.
  Crawls up to 10 pages starting at the Ryanair forum URL, following links
  whose URL matches ".+Forum/ryan.+", and applies Extract Information with
  XPath queries to each crawled document.
  Limitation reported in this thread: only the first match of each XPath
  query is used, so only the first review of a page ends up in the output.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="415" width="748">
      <!-- Crawler: start URL, link-following rule and crawl limits. -->
      <operator activated="true" class="web:process_web" compatibility="5.2.003" expanded="true" height="60" name="Process Documents from Web" width="90" x="45" y="30">
        <parameter key="url" value="http://www.airlinequality.com/Forum/ryan.htm"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+Forum/ryan.+"/>
        </list>
        <parameter key="max_pages" value="10"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="delay" value="500"/>
        <parameter key="max_threads" value="10"/>
        <parameter key="max_page_size" value="5000"/>
        <parameter key="user_agent" value="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"/>
        <process expanded="true" height="385" width="762">
          <!-- Per-page extraction: each xpath_queries entry maps an output attribute name to an XPath expression. -->
          <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information (2)" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="review" value="//h:p[@class='text2']/text()"/>
              <parameter key="rating" value="//h:p[@class='text25' and contains(., 'Rating')]/text()"/>
              <parameter key="recommended" value="//h:img[contains(@src,'_rvw.gif')]/@src"/>
              <parameter key="value_for_money" value="//h:table[starts-with(@width,'193')]//h:tr[3]//h:td[2]//h:img/@src"/>
              <parameter key="reviewed_by" value="//h:td[@class='airport']/h:h9/text()"/>
              <parameter key="seat_comfort" value="//h:table[starts-with(@width,'193')]//h:tr[4]//h:td[2]//h:img/@src"/>
              <parameter key="staff_service" value="//h:table[starts-with(@width,'193')]//h:tr[5]//h:td[2]//h:img/@src"/>
              <parameter key="catering" value="//h:table[starts-with(@width,'193')]//h:tr[6]//h:td[2]//h:img/@src"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information (2)" to_port="document"/>
          <connect from_op="Extract Information (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Web" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[ /code]

MariusHelf · December 2012

Hi,

Actually, it is expected behaviour that only the first match of the XPath is used. So far there is no novel approach, so instead of Extract Information you will have to use Cut Document (and, e.g., add Extract Information in its subprocess).

Best regards,
Marius

Boggsy · December 2012

Thanks a lot Marius, that's worked perfectly!

 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="367" width="758">
      <operator activated="true" class="web:process_web" compatibility="5.2.003" expanded="true" height="60" name="Ryanair" width="90" x="45" y="30">
        <parameter key="url" value="http://www.airlinequality.com/Forum/ryan.htm"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+Forum/ryan.+"/>
        </list>
        <parameter key="max_pages" value="10"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="delay" value="500"/>
        <parameter key="max_threads" value="10"/>
        <parameter key="max_page_size" value="5000"/>
        <parameter key="user_agent" value="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"/>
        <process expanded="true" height="385" width="776">
          <operator activated="true" class="text:cut_document" compatibility="5.2.004" expanded="true" height="60" name="Cut Document" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="review_data" value="//h:table[starts-with(@width,'751')]"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true" height="385" width="776">
              <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information (3)" width="90" x="45" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="review" value="//h:p[@class='text2']/text()"/>
                  <parameter key="rating" value="//h:p[@class='text25' and contains(., 'Rating')]/text()"/>
                  <parameter key="recommended" value="//h:img[contains(@src,'_rvw.gif')]/@src"/&gt;
                  <parameter key="value_for_money" value="//h:table[starts-with(@width,'193')]//h:tr[3]//h:td[2]//h:img/@src"/&gt;
                  <parameter key="reviewed_by" value="//h:td[@class='airport']/h:h9/text()"/>
                  <parameter key="seat_comfort" value="//h:table[starts-with(@width,'193')]//h:tr[4]//h:td[2]//h:img/@src"/&gt;
                  <parameter key="staff_service" value="//h:table[starts-with(@width,'193')]//h:tr[5]//h:td[2]//h:img/@src"/&gt;
                  <parameter key="catering" value="//h:table[starts-with(@width,'193')]//h:tr[6]//h:td[2]//h:img/@src"/&gt;
                  <parameter key="class" value="//h:table[starts-with(@width,'193')]//h:tr[2]//h:td[2]//h:p[@class='text25']/text()"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Ryanair" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

[SOLVED] Web Crawling - Process document from web returns only first record

Answers