The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
Process documents from data generates 2 times the data
Hi all,
I'm new to RapidMiner and tried to read RSS feeds and write them to files, as in this tutorial on Vimeo (https://vimeo.com/62963128). Almost everything works, but I have one problem: the process generates every file twice. File 1 and file 21 are the same, file 2 and file 22 are the same, and so on. Does anybody know what I'm doing wrong?
Regards,
Dave
I'm new to RapidMiner and tried to read RSS feeds and write them to files, as in this tutorial on Vimeo (https://vimeo.com/62963128). Almost everything works, but I have one problem: the process generates every file twice. File 1 and file 21 are the same, file 2 and file 22 are the same, and so on. Does anybody know what I'm doing wrong?
Thanks in advance.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (process version 6.5.002).

  Top-level chain, as wired by the connect elements below:
    1. "Read RSS Feed" reads the feed at the configured url.
    2. "Get Pages" receives that example set and retrieves the pages referenced by
       the "Link" attribute, storing each page in the "myhtml" attribute.
    3. "Process Documents from Data" receives the result and runs its nested
       subprocess, which extracts the text content, unescapes HTML, and writes
       the document out through "Write Document" and "Write File".

  NOTE(review): the accompanying post reports that every output file appears
  twice (file 1 equals file 21, and so on). The cause cannot be determined from
  this XML alone. TODO confirm how many times "Process Documents from Data"
  executes its subprocess per input example with create_word_vector enabled.
-->
<process version="6.5.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.5.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">

      <!-- Source: RSS feed reader; its "output" port feeds "Get Pages". -->
      <operator activated="true" class="web:read_rss" compatibility="6.5.000" expanded="true" height="60" name="Read RSS Feed" width="90" x="112" y="255">
        <parameter key="url" value="http://www.autoblog.com/category/recap/rss.xml"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
      </operator>

      <!-- Page retrieval: reads links from "Link", writes page content to "myhtml". -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="6.5.000" expanded="true" height="60" name="Get Pages" width="90" x="514" y="435">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="myhtml"/>
        <parameter key="random_user_agent" value="false"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <parameter key="delay" value="none"/>
        <parameter key="delay_amount" value="1000"/>
        <parameter key="min_delay_amount" value="0"/>
        <parameter key="max_delay_amount" value="1000"/>
      </operator>

      <!-- Document processing: TF-IDF word vector, no pruning, nested subprocess below. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="6.5.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="715" y="345">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">

          <operator activated="true" class="web:extract_html_text_content" compatibility="6.5.000" expanded="true" height="60" name="Extract Content" width="90" x="112" y="30">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="5"/>
            <parameter key="override_content_type_information" value="true"/>
            <!-- NOTE(review): the key is spelled "neglegt" in the original export;
                 presumably this is the operator's real key name. Verify before changing. -->
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>

          <operator activated="true" class="web:unescape_html" compatibility="6.5.000" expanded="true" height="60" name="Unescape HTML Document" width="90" x="313" y="30"/>

          <operator activated="true" class="text:write_document" compatibility="6.5.000" expanded="true" height="76" name="Write Document" width="90" x="514" y="30">
            <parameter key="overwrite" value="true"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>

          <!-- Output file name is built from the %{t} and %{a} macros.
               Presumably these are RapidMiner's predefined macros for the current
               system time and the operator's application count. TODO confirm. -->
          <operator activated="true" class="write_file" compatibility="6.5.002" expanded="true" height="60" name="Write File" width="90" x="715" y="165">
            <parameter key="resource_type" value="file"/>
            <parameter key="filename" value="d:\test\%{t}-%{a}.txt"/>
            <parameter key="mime_type" value="application/octet-stream"/>
          </operator>

          <!-- Subprocess wiring: incoming document, then Extract Content, then
               Unescape HTML Document, then Write Document. Write Document sends its
               "document" port to the subprocess sink and its "file" port to Write File. -->
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Unescape HTML Document" to_port="document"/>
          <connect from_op="Unescape HTML Document" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="document 1"/>
          <connect from_op="Write Document" from_port="file" to_op="Write File" to_port="file"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring: Read RSS Feed, then Get Pages, then Process Documents
           from Data, whose example set is delivered to "result 1". -->
      <connect from_op="Read RSS Feed" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Regards,
Dave
Tagged:
0