The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
"Dynamic Web Crawling Tripadvisor"
Hello everyone,
I'm trying to get the reviews from tripadvisor to do a sentiment analysis. It works, but not in the way I want.
Here's my code on how to fetch html pages.
I'm looking for a more dynamic process where I don't have to configure every hotel in a city by hand. I would like a process that, as a first step, collects all the available hotels in a city and then, more importantly, all the reviews of those hotels.
This process just shows how to fetch html pages.
Thank you for your help!
Dominik
I'm trying to get the reviews from tripadvisor to do a sentiment analysis. It works, but not in the way I want.
Here's my code on how to fetch html pages.
In it, I have a separate Crawl Web operator for each hotel whose reviews I want to fetch. The crawl for each hotel has to be started from the URL of the latest review written for that hotel.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.4.000">
<!-- RapidMiner process definition; this root element declares version 6.4.000. -->
<context>
<input/>
<output>
<!-- Relative location configured for the process output. NOTE(review): presumably resolved against the process's own repository location; confirm. -->
<location>../../Data/Html-Pages-Tripadvisor</location>
</output>
<macros/>
</context>
<!-- Root operator: contains every operator and connection of the process. -->
<operator activated="true" class="process" compatibility="6.4.000" expanded="true" name="Process">
<!-- NOTE(review): resultfile is an absolute Windows path on one user's machine; presumably it has to be adjusted before the process is run elsewhere. -->
<parameter key="resultfile" value="C:\Users\Dominik\Documents\Studium\Projektarbeiten\Test_Data-Extraction\1_Fetch-Result-Example-Set.res"/>
<process expanded="true">
<!-- Sets the macro "max-pages" to 10. The Crawl Web operators of this process read it via %{max-pages} in their max_pages parameter. -->
<operator name="Set Macro" class="set_macro" activated="true" compatibility="6.4.000"
          expanded="true" height="60" width="90" x="45" y="165">
  <parameter key="value" value="10"/>
  <parameter key="macro" value="max-pages"/>
  <description align="center" color="transparent" colored="false" width="126">This macro defines the maximal number of pages fetched for each hotel in Are.</description>
</operator>
<!-- Crawl Web operator for STF Hotel Are Torg. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="STF_Hotel_Are_Torg" width="90" x="246" y="75">
  <!-- Start page: one concrete review (r154089743) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d2400688-r154089743-STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+STF_Hotel_Are_Torg-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Are Continental Inn. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Are_Continental_Inn" width="90" x="246" y="165">
  <!-- Start page: one concrete review (r127124263) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d678441-r127124263-Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Are_Continental_Inn-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Tott Hotel Are. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Tott_Hotel_Are" width="90" x="246" y="255">
  <!-- Start page: one concrete review (r153044193) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d486763-r153044193-Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Tott_Hotel_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Fjallgarden Hotel. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Fjallgarden_Hotel" width="90" x="246" y="345">
  <!-- Start page: one concrete review (r137349978) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d565631-r137349978-Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Fjallgarden_Hotel-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Hotel Diplomat Aregarden. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name (written with a hyphen, unlike the URL slug) is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Hotel_Diplomat-Aregarden" width="90" x="246" y="435">
  <!-- Start page: one concrete review (r148521338) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1157031-r148521338-Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Hotel_Diplomat_Aregarden-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Holiday Club Are. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. The operator name (written with spaces, unlike the URL slug) is referenced by a connect element (from_op); keep both in sync. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Holiday Club Are" width="90" x="246" y="525">
  <!-- Start page: one concrete review (r152160882) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1016233-r152160882-Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Holiday_Club_Are-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Crawl Web operator for Copperhill Mountain Lodge. Starts at the review page given in "url", follows ShowUserReviews links of this hotel and stores the matching pages. NOTE(review): the operator name is spelled "Cooperhil" while the URLs use "Copperhill"; the name is referenced by a connect element (from_op), so a rename has to change both places. -->
<operator activated="true" class="web:crawl_web" compatibility="5.3.002" expanded="true"
          height="60" name="Cooperhil Mountain Lodge" width="90" x="246" y="615">
  <!-- Start page: one concrete review (r153179437) of this hotel. -->
  <parameter key="url" value="http://www.tripadvisor.co.uk/ShowUserReviews-g670155-d1236656-r153179437-Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.html#CHECK_RATES_CONT"/>
  <!-- Rule values are regular expressions. The dots of the host name in the store rule are escaped so that they match a literal "." only. -->
  <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+/ShowUserReviews.*Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.*#REVIEWS"/>
    <parameter key="store_with_matching_url" value=".+tripadvisor\.co\.uk/ShowUserReviews.+Copperhill_Mountain_Lodge-Are_Jamtland_County_Jamtland_and_Harjedalen.*"/>
  </list>
  <!-- Pages are added to the example set as an attribute rather than written to files. -->
  <parameter key="write_pages_into_files" value="false"/>
  <parameter key="add_pages_as_attribute" value="true"/>
  <!-- NOTE(review): machine-specific absolute path; presumably unused while write_pages_into_files is false. Confirm before relying on it. -->
  <parameter key="output_dir" value="C:\Wolfram\Dropbox\Wolfram\Research\BusinessIntelligence\Sentiment Analysis\1. Data Extraction\OriginalReviews"/>
  <parameter key="extension" value="html"/>
  <!-- The page limit is taken from the max-pages macro. -->
  <parameter key="max_pages" value="%{max-pages}"/>
  <parameter key="max_depth" value="100"/>
  <parameter key="domain" value="server"/>
  <parameter key="max_page_size" value="5000"/>
</operator>
<!-- Append operator: receives the example sets of the seven crawl operators on its ports "example set 1" to "example set 7" and emits them on "merged set". -->
<operator name="Append" class="append" activated="true" compatibility="6.4.000"
          expanded="true" height="184" width="90" x="380" y="255"/>
<!-- Retrieve Pages operator fed by the merged set of Append. NOTE(review): link_attribute presumably names the attribute ("Link") holding the URLs to fetch; confirm against the crawl output. -->
<operator name="Get Pages" class="web:retrieve_webpages" activated="true" compatibility="5.3.002"
          expanded="true" height="60" width="90" x="514" y="255">
  <parameter key="link_attribute" value="Link"/>
</operator>
<!-- Each crawl operator's "Example Set" output feeds one input port of Append. -->
<connect from_op="STF_Hotel_Are_Torg" from_port="Example Set"
         to_op="Append" to_port="example set 1"/>
<connect from_op="Are_Continental_Inn" from_port="Example Set"
         to_op="Append" to_port="example set 2"/>
<connect from_op="Tott_Hotel_Are" from_port="Example Set"
         to_op="Append" to_port="example set 3"/>
<connect from_op="Fjallgarden_Hotel" from_port="Example Set"
         to_op="Append" to_port="example set 4"/>
<connect from_op="Hotel_Diplomat-Aregarden" from_port="Example Set"
         to_op="Append" to_port="example set 5"/>
<connect from_op="Holiday Club Are" from_port="Example Set"
         to_op="Append" to_port="example set 6"/>
<connect from_op="Cooperhil Mountain Lodge" from_port="Example Set"
         to_op="Append" to_port="example set 7"/>
<!-- The merged set goes to Get Pages, whose output is delivered on the process port "result 1". -->
<connect from_op="Append" from_port="merged set"
         to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set"
         to_port="result 1"/>
<!-- Port spacing entries of the enclosing process (all 0). -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
I'm looking for a more dynamic process where I don't have to configure every hotel in a city by hand. I would like a process that, as a first step, collects all the available hotels in a city and then, more importantly, all the reviews of those hotels.
This process just shows how to fetch html pages.
Thank you for your help!
Dominik
Tagged:
0