The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
Get Page operator stalls RapidMiner (SOLVED)
When running the process below (with web mining and text mining extensions loaded) RapidMiner stalls when trying to display the results. It eventually shows the results but something seems to be running in the background and it makes RapidMiner very sluggish.
I've been using this for years. Also tried version 10 and I'm experiencing the same issue.
Note: I wasn't allowed to post links which were in the XML code. To replicate just add 2 random links to the Get Page operator.
Any ideas?
I've been using this for years. Also tried version 10 and I'm experiencing the same issue.
Note: I wasn't allowed to post links which were in the XML code. To replicate just add 2 random links to the Get Page operator.
Any ideas?
<?xml version="1.0" encoding="UTF-8"?><process version="9.10.013">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!--
  Get Page: first of two active web:get_webpage operators.
  The url value is empty because the poster had to remove the links before
  publishing the process; set it to any page URL to reproduce the issue.
  Its output is connected to port "documents 1" of "Process Documents".
-->
<operator activated="true" class="web:get_webpage" compatibility="9.7.002" expanded="true" height="68" name="Get Page" width="90" x="45" y="85">
  <!-- Target and request settings -->
  <parameter key="url" value=""/>
  <parameter key="random_user_agent" value="true"/>
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <list key="query_parameters"/>
  <list key="request_properties"/>
  <!-- Response decoding -->
  <parameter key="override_encoding" value="false"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="keep_sensitive_headers" value="false"/>
</operator>
<!--
  Get Page (2): second active web:get_webpage operator, configured the same
  way as "Get Page". The url value is empty because the poster had to remove
  the links; set it to a second page URL to reproduce the issue.
  Its output is connected to port "documents 2" of "Process Documents".
-->
<operator activated="true" class="web:get_webpage" compatibility="9.7.002" expanded="true" height="68" name="Get Page (2)" width="90" x="45" y="187">
  <!-- Target and request settings -->
  <parameter key="url" value=""/>
  <parameter key="random_user_agent" value="true"/>
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <list key="query_parameters"/>
  <list key="request_properties"/>
  <!-- Response decoding -->
  <parameter key="override_encoding" value="false"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="keep_sensitive_headers" value="false"/>
</operator>
<!--
  Read Excel: disabled operator (activated="false") with a breakpoint set
  after it. Reads cell range A1:A80 of sheet 1 and declares a single
  attribute named URLS; its output is connected to "Get Pages".
  NOTE(review): excel_file is an absolute path on the author's machine and
  must be changed before this branch can be enabled elsewhere.
-->
<operator activated="false" breakpoints="after" class="read_excel" compatibility="6.0.003" expanded="true" height="68" name="Read Excel" width="90" x="112" y="289">
  <!-- Source workbook and cell selection -->
  <parameter key="excel_file" value="C:/Users/hofma/Dropbox/ITB/2022 - 2023/Sem 1/Text Mining/Module Content/Session 6/S6 RapidMiner Files/daft_urls.xls"/>
  <parameter key="sheet_selection" value="sheet number"/>
  <parameter key="sheet_number" value="1"/>
  <parameter key="imported_cell_range" value="A1:A80"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="first_row_as_names" value="false"/>
  <list key="annotations">
    <parameter key="0" value="Name"/>
  </list>
  <!-- Value parsing -->
  <parameter key="date_format" value=""/>
  <parameter key="time_zone" value="SYSTEM"/>
  <parameter key="locale" value="English (United States)"/>
  <parameter key="read_all_values_as_polynominal" value="false"/>
  <!-- Column metadata: one column, named URLS -->
  <list key="data_set_meta_data_information">
    <parameter key="0" value="URLS.true.file_path.attribute"/>
  </list>
  <parameter key="read_not_matching_values_as_missings" value="true"/>
</operator>
<!--
  Get Pages: disabled operator (activated="false") of class
  web:retrieve_webpages. Takes the example set from "Read Excel", uses the
  URLS attribute as the link attribute, and passes its example set on to
  "Process Documents from Data".
-->
<operator activated="false" class="web:retrieve_webpages" compatibility="9.7.002" expanded="true" height="68" name="Get Pages" width="90" x="380" y="289">
  <!-- Which attribute holds the links, and how requests identify themselves -->
  <parameter key="link_attribute" value="URLS"/>
  <parameter key="random_user_agent" value="false"/>
  <parameter key="user_agent" value="RapidMiner"/>
  <!-- Request settings -->
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <!-- Delay between requests is switched off (delay="none") -->
  <parameter key="delay" value="none"/>
  <parameter key="delay_amount" value="1000"/>
  <parameter key="min_delay_amount" value="0"/>
  <parameter key="max_delay_amount" value="1000"/>
</operator>
<!--
  Process Documents from Data: disabled branch (activated="false").
  Configured to create a TF-IDF word vector with pruning switched off; it is
  wired to receive the example set produced by "Get Pages".
  The inner subprocess passes each document through "Extract Information (3)"
  (XPath queries) and then "Extract Content".

  Well-formedness fixes applied to the attribute values below:
    * markup characters inside the region query are written as &lt; and &gt;,
      because a literal less-than sign is not allowed in an attribute value;
    * the quotes inside the XPath predicates are written as &quot; on BOTH
      sides, so they no longer terminate the attribute early.
  The parsed parameter values are the ones the original export intended.
-->
<operator activated="false" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="289">
  <!-- Word vector settings -->
  <parameter key="create_word_vector" value="true"/>
  <parameter key="vector_creation" value="TF-IDF"/>
  <parameter key="add_meta_information" value="true"/>
  <parameter key="keep_text" value="false"/>
  <parameter key="prune_method" value="none"/>
  <parameter key="prune_below_percent" value="3.0"/>
  <parameter key="prune_above_percent" value="30.0"/>
  <parameter key="prune_below_rank" value="0.05"/>
  <parameter key="prune_above_rank" value="0.95"/>
  <parameter key="datamanagement" value="double_sparse_array"/>
  <parameter key="data_management" value="auto"/>
  <parameter key="select_attributes_and_weights" value="false"/>
  <list key="specify_weights"/>
  <process expanded="true">
    <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information (3)" width="90" x="313" y="34">
      <!-- query_type selects XPath; the other query lists are stored but belong to other query types -->
      <parameter key="query_type" value="XPath"/>
      <!-- NOTE(review): key spelling kept exactly as exported; it is read at runtime, so do not "correct" it here -->
      <list key="string_machting_queries">
        <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
      </list>
      <parameter key="attribute_type" value="Nominal"/>
      <list key="regular_expression_queries">
        <parameter key="test" value="\bDepartment:\s+\K\S+"/>
      </list>
      <list key="regular_region_queries">
        <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
      </list>
      <list key="xpath_queries">
        <parameter key="Title" value="h:html/h:head/h:title/text()"/>
        <parameter key="Price" value="//*[@id=&quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
        <parameter key="Bed" value="//*[@data-testid=&quot;beds&quot;]/text()"/>
        <parameter key="DescriptionText" value="//*[@data-testid=&quot;description&quot;]/text()"/>
      </list>
      <list key="namespaces"/>
      <parameter key="ignore_CDATA" value="true"/>
      <parameter key="assume_html" value="true"/>
      <list key="index_queries"/>
      <list key="jsonpath_queries"/>
    </operator>
    <operator activated="true" class="web:extract_html_text_content" compatibility="9.7.002" expanded="true" height="68" name="Extract Content" width="90" x="581" y="34">
      <parameter key="extract_content" value="true"/>
      <parameter key="minimum_text_block_length" value="5"/>
      <parameter key="override_content_type_information" value="true"/>
      <!-- NOTE(review): key spelling kept exactly as exported; it is read at runtime -->
      <parameter key="neglegt_span_tags" value="true"/>
      <parameter key="neglect_p_tags" value="true"/>
      <parameter key="neglect_b_tags" value="true"/>
      <parameter key="neglect_i_tags" value="true"/>
      <parameter key="neglect_br_tags" value="true"/>
      <parameter key="ignore_non_html_tags" value="true"/>
    </operator>
    <!-- Wiring: subprocess input, then Extract Information (3), then Extract Content, then subprocess output -->
    <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
    <connect from_op="Extract Information (3)" from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!--
  Process Documents: the active branch. Receives the outputs of "Get Page"
  and "Get Page (2)" on ports "documents 1" and "documents 2", creates a
  TF-IDF word vector with pruning switched off, and delivers its example set
  to the process result.

  The subprocess contains only "Extract Information"; there is no tokenizing
  operator. According to the resolution posted in this thread, the whole
  document then becomes a single token and the results view becomes very slow.
  NOTE(review): per that resolution, either add a tokenizer inside this
  subprocess or set create_word_vector to false when no vector is needed.

  Well-formedness fixes applied to the attribute values below:
    * markup characters inside the region query are written as &lt; and &gt;,
      because a literal less-than sign is not allowed in an attribute value;
    * the quotes inside the XPath predicates are written as &quot; on BOTH
      sides, so they no longer terminate the attribute early.
-->
<operator activated="true" class="text:process_documents" compatibility="9.4.000" expanded="true" height="124" name="Process Documents" width="90" x="447" y="85">
  <!-- Word vector settings -->
  <parameter key="create_word_vector" value="true"/>
  <parameter key="vector_creation" value="TF-IDF"/>
  <parameter key="add_meta_information" value="true"/>
  <parameter key="keep_text" value="false"/>
  <parameter key="prune_method" value="none"/>
  <parameter key="prune_below_percent" value="3.0"/>
  <parameter key="prune_above_percent" value="30.0"/>
  <parameter key="prune_below_rank" value="0.05"/>
  <parameter key="prune_above_rank" value="0.95"/>
  <parameter key="datamanagement" value="double_sparse_array"/>
  <parameter key="data_management" value="auto"/>
  <process expanded="true">
    <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information" width="90" x="179" y="34">
      <!-- query_type selects XPath; the other query lists are stored but belong to other query types -->
      <parameter key="query_type" value="XPath"/>
      <!-- NOTE(review): key spelling kept exactly as exported; it is read at runtime, so do not "correct" it here -->
      <list key="string_machting_queries">
        <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
      </list>
      <parameter key="attribute_type" value="Nominal"/>
      <list key="regular_expression_queries">
        <parameter key="test" value="\bDepartment:\s+\K\S+"/>
      </list>
      <list key="regular_region_queries">
        <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
      </list>
      <list key="xpath_queries">
        <parameter key="Title" value="h:html/h:head/h:title/text()"/>
        <parameter key="Price" value="//*[@id=&quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
        <parameter key="Bed" value="//*[@data-testid=&quot;beds&quot;]/text()"/>
      </list>
      <list key="namespaces"/>
      <parameter key="ignore_CDATA" value="true"/>
      <parameter key="assume_html" value="true"/>
      <list key="index_queries"/>
      <list key="jsonpath_queries"/>
    </operator>
    <!-- Wiring: subprocess input, then Extract Information, then subprocess output -->
    <connect from_port="document" to_op="Extract Information" to_port="document"/>
    <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Get Page (2)" from_port="output" to_op="Process Documents" to_port="documents 2"/>
<connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
I tested your process with another regular expression because the URL is not included in your data.
Just by changing the compatibility level of the Extract Information operator, the process runs faster and is more stable.
Please try it.
Best,
Cesar
It makes a small difference but it still takes minutes to display the results from two web pages.
The issue was with the document vector creation: not producing a document vector resolved it. If you need a document vector of the HTML content, then adding a tokenizer will also eliminate the long wait time and unresponsiveness. In RapidMiner, when no tokenizer is used, the entire document is treated as a single token, and RapidMiner seems to struggle to render this.
Thanks again
Markus