The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
How to use XPath for extracting multiple review data from a single webpage
subhasisdasgupt
Member Posts: 15 Contributor II
I am new to XPath, but I need to extract multiple reviews from a single webpage. My objective is to extract the reviewer's name, the date of the review, the rating and the entire review text. Each reviewer should be a separate record in my example set. Is there any way to do that? I was working with review pages of epinion.com.
0
Answers
Best regards,
Marius
Best regards,
Marius
I am attaching the XML code so that you can have a look. I am unable to save the output in a proper .csv format so that I can import the same data later. I want to do some analysis on the customer reviews. However, the "Review" attribute appears neither in the "Select Attribute" node nor in the "Set Role" node once connected.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (version 5.3.005).

  Pipeline, as wired by the <connect> elements below:
    1. "Process Documents from Files" reads documents from the two configured
       text directories.
    2. Inside it, "Cut Document (2)" applies the "Start" XPath query and hands
       each resulting segment to its inner process.
    3. For every segment, "Remove Document Parts (2)" deletes the text matched
       by deletion_regex, then "Extract Information (5)" evaluates four XPath
       queries (Reviewer, Review date, Rating, Review).
    4. The example set produced by "Process Documents from Files" is delivered
       to the process output port "result 1".

  NOTE(review): the XPath queries use the "h:" prefix while every
  <list key="namespaces"/> is empty. Presumably the text operators bind "h"
  to the XHTML namespace themselves; confirm against the operator settings.
-->
<process version="5.3.005">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true"
                class="text:process_document_from_file"
                compatibility="5.3.000"
                expanded="true"
                height="76"
                name="Process Documents from Files"
                width="90"
                x="45"
                y="30">
        <!-- Each entry maps a label (key) to a local directory (value). -->
        <list key="text_directories">
          <parameter key="S3" value="D:\Web_crawl_S3"/>
          <parameter key="Advance S" value="D:\Web Data 1"/>
        </list>
        <parameter key="extract_text_only" value="false"/>
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <operator activated="true"
                    class="text:cut_document"
                    compatibility="5.3.000"
                    expanded="true"
                    height="60"
                    name="Cut Document (2)"
                    width="90"
                    x="246"
                    y="30">
            <parameter key="query_type" value="XPath"/>
            <!-- Key names below are reproduced exactly as given; they are
                 identifiers read by the operator, not free text. -->
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <!-- Selects every div carrying the full review class string. -->
              <parameter key="Start" value="//h:div[@class='fclear fk-review fk-position-relative line']"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true">
              <operator activated="true"
                        class="text:remove_document_parts"
                        compatibility="5.3.000"
                        expanded="true"
                        height="60"
                        name="Remove Document Parts (2)"
                        width="90"
                        x="112"
                        y="30">
                <!-- The regex is (<br clear="none" />) once parsed. The markup
                     characters must be written as entities here: a literal
                     less-than sign or an unescaped double quote inside a
                     double-quoted attribute value makes the file ill-formed. -->
                <parameter key="deletion_regex" value="(&lt;br clear=&quot;none&quot; /&gt;)"/>
              </operator>
              <operator activated="true"
                        class="text:extract_information"
                        compatibility="5.3.000"
                        expanded="true"
                        height="60"
                        name="Extract Information (5)"
                        width="90"
                        x="246"
                        y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <!-- One query per extracted field; the key is the field name. -->
                <list key="xpath_queries">
                  <parameter key="Reviewer" value="//h:a[@profile_name]/text()"/>
                  <parameter key="Review date" value="//h:div[@class='date line fk-font-small']/text()"/>
                  <parameter key="Rating" value="//h:div[@class='fk-stars-small']/@title"/>
                  <parameter key="Review" value="//h:p[@class='line bmargin10']/text()"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <!-- segment -> Remove Document Parts (2) -> Extract Information (5) -> document 1 -->
              <connect from_port="segment" to_op="Remove Document Parts (2)" to_port="document"/>
              <connect from_op="Remove Document Parts (2)" from_port="document" to_op="Extract Information (5)" to_port="document"/>
              <connect from_op="Extract Information (5)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- document -> Cut Document (2) -> document 1 -->
          <connect from_port="document" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- The example set of the outer operator is the process result. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Can you help me in this regard?
Best regards,
Marius