How to create a process for already prepared data (NLP, NN)

Maja · September 2019

Hi!
I've run into a huge problem in RapidMiner. Basically, I have a dataset of word vectors that are categorized into 5 labels. I'd like to train a neural network to associate the proper combinations (in this case, the appearance of words in sentences) with one of the 5 labels. The problem is that I do not know how to design a useful process. I've watched a lot of RapidMiner tutorials, but unfortunately none of them are similar enough to my project.

Could you please help me? This is my engineering thesis and I'd really love to finish it properly.

))

Thanks in advance!

PS: the 9.csv file is the output, and the arxiv(...).txt file is an example input.

kayman · September 2019

If I understand correctly, WordList to Data is what you need. I used your input to create similar output, and this is what I got:

Image: https://us.v-cdn.net/6030995/uploads/editor/7c/7y7kdmjv8gjj.png

This is the simplified process :

<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
  <!-- RapidMiner process with three operators: Read CSV, Process Documents from Data
       and WordList to Data. The connect elements near the end define the wiring. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Reads the input file using a tab (&#9;) as column separator and no header row.
           Column 0 is declared as the polynominal label, column 1 as the text attribute "txt".
           '#' is configured as the comment character (skip_comments=true).
           Adjust csv_file to the location of the file on your machine. -->
      <operator activated="true" class="read_csv" compatibility="9.3.001" expanded="true" height="68" name="Read CSV" width="90" x="112" y="34">
        <parameter key="csv_file" value="C:\Users\yourfolder\Downloads\arxiv_annotate3_80_1.txt"/>
        <parameter key="column_separators" value="&#9;"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="true"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="starting_row" value="1"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="label.true.polynominal.label"/>
          <parameter key="1" value="txt.true.text.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Creates a word vector using "Binary Term Occurrences" and keeps no text column
           (keep_text=false). prune_method is "none"; the prune_* thresholds below are
           presumably unused in that mode (TODO confirm). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.2.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- The only per-document step: Tokenize in "regular expression" mode with the
               expression \s (a whitespace character). -->
          <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value=".:"/>
            <parameter key="expression" value="\s"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.2.000" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="85"/>
      <!-- Wiring: the Read CSV output feeds Process Documents from Data. Its example set is
           delivered as result 1; its word list goes through WordList to Data and the
           resulting example set is delivered as result 2. -->
      <connect from_op="Read CSV" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Maja · September 2019

Hey!
Thank you for your help! But this process doesn't filter out special words like "CITATION" and "NUMBER", and I see that it shows "al" as a word, which is an error. Is there any possibility of working with Python code?
The program should determine whether the neural network assigns a sentence to the correct label. So I would need a loop that goes through the already preprocessed output (the preprocessing is written in Python), and then it should learn that certain combinations of words go with certain labels... Is it even possible to do this?

I'd really appreciate your help. It feels like I've gone through a million websites and I still don't see any answer.

Thanks in advance

kayman · September 2019

Hi @Maja, You can always improve the process within the 'process documents' container. This is where all the logic goes.
You could, for instance, pretty easily filter out the special words, as they are the only ones in capitals. The Filter Tokens operator can do that for you: just filter out all words in capitals, etc.

'al' shows up because it was in your original text (it's mentioned in 'sandroni et al'), so the fact that it shows up is correct. You can filter such words out by creating your own dictionary.

Note that this process is actually trying to do the same thing as what your output (9.csv) shows, namely generating a vector set from a document, so to get the same results, the same cleaning and optimizing needs to be done as in your Python workflow.

If I understand it correctly however, this is what you already have, and you need to get the keywords by label from this file (9.csv), correct?

The attached example shows the 'improved' processes: one where I created a 9.csv-like output from your source (arxiv), and one where I use your 9.csv to create a word list by label. The results are pretty similar, so this might work for you as well. But I have to admit I'm not really sure what you would like to achieve, so it could also be utterly useless.

You can indeed work in Python without any problem: the Python operator allows you to work directly with Python libraries, as long as your dataset is 'pandas friendly', as this is the default way RapidMiner communicates with Python. But if you can do it in RM, you may not need the Python part at all.

<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <!-- Reads the raw input file using a tab (&#9;) as column separator and no header row.
           Column 0 is declared as the polynominal label, column 1 as the text attribute "txt".
           '#' is configured as the comment character (skip_comments=true).
           csv_file is a machine-specific absolute path and must be adjusted. -->
      <operator activated="true" class="read_csv" compatibility="9.3.001" expanded="true" height="68" name="Read CSV" width="90" x="112" y="34">
        <parameter key="csv_file" value="C:\Users\bewoutek\Downloads\arxiv_annotate3_80_1.txt"/>
        <parameter key="column_separators" value="&#9;"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="true"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="starting_row" value="1"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="label.true.polynominal.label"/>
          <parameter key="1" value="txt.true.text.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <description align="center" color="transparent" colored="false" width="126">your raw data, adjust the location</description>
      </operator>
      <!-- Creates a word vector using "Binary Term Occurrences" with prune_method "none".
           Main token chain inside: Tokenize, Filter Tokens (by Length), Filter Tokens (by Content),
           Filter Stopwords (Dictionary). A side chain (Create Document, Write Document) supplies
           the stopword file to the dictionary filter. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.2.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Tokenize in "regular expression" mode with the expression \s (a whitespace character). -->
          <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value=".:"/>
            <parameter key="expression" value="\s"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
            <description align="center" color="transparent" colored="false" width="126">split by word</description>
          </operator>
          <!-- Length limits for tokens: min_chars=1, max_chars=50. -->
          <operator activated="true" class="text:filter_by_length" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="179" y="34">
            <parameter key="min_chars" value="1"/>
            <parameter key="max_chars" value="50"/>
          </operator>
          <!-- Case-sensitive, non-inverted "matches" filter on ^[a-z]+$ : only tokens made up
               entirely of lowercase a-z satisfy the condition, so tokens containing capitals,
               digits or punctuation do not. -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="313" y="34">
            <parameter key="condition" value="matches"/>
            <parameter key="regular_expression" value="^[a-z]+$"/>
            <parameter key="case_sensitive" value="true"/>
            <parameter key="invert condition" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">remove special tokens</description>
          </operator>
          <!-- Inline stopword list: the text is "et" and "al" separated by a line feed (&#10;). -->
          <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="Create Document" width="90" x="246" y="187">
            <parameter key="text" value="et&#10;al"/>
            <parameter key="add label" value="false"/>
            <parameter key="label_type" value="nominal"/>
          </operator>
          <!-- Turns the inline document into a file object; its "file" port is connected to the
               "file" port of Filter Stopwords (Dictionary) below. -->
          <operator activated="true" class="text:write_document" compatibility="8.2.000" expanded="true" height="82" name="Write Document" width="90" x="380" y="187">
            <parameter key="overwrite" value="true"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <!-- Dictionary-based stopword filter, case-insensitive, fed by the file created above. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="8.2.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="514" y="34">
            <parameter key="case_sensitive" value="false"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Create Document" from_port="output" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="file" to_op="Filter Stopwords (Dictionary)" to_port="file"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Text-extension operator wordlist_to_data with default parameters. Its connections are
           defined by connect elements that are not part of this excerpt. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.2.000" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="238"/>
      <!-- Renames attributes (filter type "all", special attributes not included): every match of
           the regex  in class \((.*)\)  is replaced by its first capture group ($1), i.e. the text
           between the parentheses.
           NOTE(review): presumably targets the per-class columns produced by WordList to Data;
           the wiring is not visible in this excerpt, confirm. -->
      <operator activated="true" class="rename_by_replacing" compatibility="9.3.001" expanded="true" height="82" name="Rename by Replacing (2)" width="90" x="514" y="238">
        <parameter key="attribute_filter_type" value="all"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="replace_what" value="in class \((.*)\)"/>
        <parameter key="replace_by" value="$1"/>
        <description align="center" color="transparent" colored="false" width="126">some label cleanup</description>
      </operator>
      <!-- Reads the already prepared word-vector file: comma-separated, first row used as
           attribute names, no explicit column metadata, comment skipping disabled, and values
           that do not match the column type read as missing.
           csv_file is a machine-specific absolute path and must be adjusted. -->
      <operator activated="true" class="read_csv" compatibility="9.3.001" expanded="true" height="68" name="Read CSV (2)" width="90" x="112" y="442">
        <parameter key="csv_file" value="C:\Users\bewoutek\Downloads\9.csv"/>
        <parameter key="column_separators" value=","/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="false"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="starting_row" value="1"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information"/>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
        <description align="center" color="transparent" colored="false" width="126">your output, adjust the location</description>
      </operator>
      <!-- Gives the attribute named "label" the role "label"; no additional roles are set. -->
      <operator activated="true" class="set_role" compatibility="9.3.001" expanded="true" height="82" name="Set Role" width="90" x="246" y="442">
        <parameter key="attribute_name" value="label"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Multiply operator with default parameters. Its connections are defined by connect
           elements that are not part of this excerpt.
           NOTE(review): presumably fans the labelled data out to more than one consumer, confirm. -->
      <operator activated="true" class="multiply" compatibility="9.3.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="544"/>
      <!-- Builds a keyword-by-label table from the example set arriving on "in 1":
             1. Loop Values iterates over the values of "label" (macro lbl) and keeps the rows of
                the current label.
             2. Loop Attributes iterates over the regular attributes (macro ca): each attribute is
                isolated, renamed to "count", tagged with its original name in a new "keyword"
                attribute, and rows with count = 0 are dropped.
             3. The pieces are appended, counted per keyword/label, pivoted so that each label
                becomes a column, missing cells are set to 0 and the pivot prefix is stripped
                from the column names. -->
      <operator activated="true" class="subprocess" compatibility="9.3.001" expanded="true" height="82" name="Subprocess" width="90" x="782" y="646">
        <process expanded="true">
          <operator activated="true" class="concurrency:loop_values" compatibility="9.3.001" expanded="true" height="82" name="Loop Values" width="90" x="45" y="34">
            <parameter key="attribute" value="label"/>
            <parameter key="iteration_macro" value="lbl"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <!-- Exact match on the current label value. A substring test ("contains") would also
                   select rows of any other label whose name merely includes %{lbl}, mixing labels
                   within one iteration and inflating the keyword counts. -->
              <operator activated="true" class="filter_examples" compatibility="9.3.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="34">
                <parameter key="parameter_expression" value=""/>
                <parameter key="condition_class" value="custom_filters"/>
                <parameter key="invert_filter" value="false"/>
                <list key="filters_list">
                  <parameter key="filters_entry_key" value="label.equals.%{lbl}"/>
                </list>
                <parameter key="filters_logic_and" value="true"/>
                <parameter key="filters_check_metadata" value="true"/>
                <description align="center" color="transparent" colored="false" width="126">filter on current label</description>
              </operator>
              <operator activated="true" class="concurrency:loop_attributes" compatibility="9.3.001" expanded="true" height="82" name="Loop Attributes" width="90" x="179" y="34">
                <parameter key="attribute_filter_type" value="all"/>
                <parameter key="attribute" value=""/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
                <parameter key="attribute_name_macro" value="ca"/>
                <parameter key="reuse_results" value="false"/>
                <parameter key="enable_parallel_execution" value="true"/>
                <process expanded="true">
                  <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
                    <parameter key="attribute_filter_type" value="single"/>
                    <parameter key="attribute" value="%{ca}"/>
                    <parameter key="attributes" value=""/>
                    <parameter key="use_except_expression" value="false"/>
                    <parameter key="value_type" value="attribute_value"/>
                    <parameter key="use_value_type_exception" value="false"/>
                    <parameter key="except_value_type" value="time"/>
                    <parameter key="block_type" value="attribute_block"/>
                    <parameter key="use_block_type_exception" value="false"/>
                    <parameter key="except_block_type" value="value_matrix_row_start"/>
                    <parameter key="invert_selection" value="false"/>
                    <parameter key="include_special_attributes" value="false"/>
                    <description align="center" color="transparent" colored="false" width="126">current attribute</description>
                  </operator>
                  <operator activated="true" class="rename" compatibility="9.3.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
                    <parameter key="old_name" value="%{ca}"/>
                    <parameter key="new_name" value="count"/>
                    <list key="rename_additional_attributes"/>
                    <description align="center" color="transparent" colored="false" width="126">rename it to something common</description>
                  </operator>
                  <!-- New attribute "keyword" is generated from the expression %{ca}, the macro that
                       holds the name of the attribute currently being looped over. -->
                  <operator activated="true" class="generate_attributes" compatibility="9.3.001" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
                    <list key="function_descriptions">
                      <parameter key="keyword" value="%{ca}"/>
                    </list>
                    <parameter key="keep_all" value="true"/>
                    <description align="center" color="transparent" colored="false" width="126">add the attribute (the word) as a value</description>
                  </operator>
                  <operator activated="true" class="filter_examples" compatibility="9.3.001" expanded="true" height="103" name="Filter Examples (2)" width="90" x="447" y="34">
                    <parameter key="parameter_expression" value=""/>
                    <parameter key="condition_class" value="custom_filters"/>
                    <parameter key="invert_filter" value="false"/>
                    <list key="filters_list">
                      <parameter key="filters_entry_key" value="count.ne.0"/>
                    </list>
                    <parameter key="filters_logic_and" value="true"/>
                    <parameter key="filters_check_metadata" value="true"/>
                    <description align="center" color="transparent" colored="false" width="126">filter out all 0 values</description>
                  </operator>
                  <connect from_port="input 1" to_op="Select Attributes" to_port="example set input"/>
                  <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
                  <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
                  <connect from_op="Generate Attributes" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
                  <connect from_op="Filter Examples (2)" from_port="example set output" to_port="output 1"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
                <description align="center" color="transparent" colored="false" width="126">loop through all the attributes and throw out the ones not linked to the label (value = 0)</description>
              </operator>
              <operator activated="true" class="append" compatibility="9.3.001" expanded="true" height="82" name="Append" width="90" x="313" y="34">
                <parameter key="datamanagement" value="double_array"/>
                <parameter key="data_management" value="auto"/>
                <parameter key="merge_type" value="all"/>
                <description align="center" color="transparent" colored="false" width="126">stick all together</description>
              </operator>
              <connect from_port="input 1" to_op="Filter Examples" to_port="example set input"/>
              <connect from_op="Filter Examples" from_port="example set output" to_op="Loop Attributes" to_port="input 1"/>
              <connect from_op="Loop Attributes" from_port="output 1" to_op="Append" to_port="example set 1"/>
              <connect from_op="Append" from_port="merged set" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">loop through labels</description>
          </operator>
          <!-- Merges the per-label results coming out of Loop Values into one example set. -->
          <operator activated="true" class="append" compatibility="9.3.001" expanded="true" height="82" name="Append (2)" width="90" x="179" y="34">
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="merge_type" value="all"/>
          </operator>
          <!-- Applies the "count" aggregation to the "count" attribute, grouped by keyword and
               label. The resulting column is the one Rename (2) picks up as count(count). -->
          <operator activated="true" class="aggregate" compatibility="9.3.001" expanded="true" height="82" name="Aggregate" width="90" x="313" y="34">
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default_aggregation_function" value="average"/>
            <list key="aggregation_attributes">
              <parameter key="count" value="count"/>
            </list>
            <parameter key="group_by_attributes" value="keyword|label"/>
            <parameter key="count_all_combinations" value="false"/>
            <parameter key="only_distinct" value="false"/>
            <parameter key="ignore_missings" value="true"/>
          </operator>
          <operator activated="true" class="rename" compatibility="9.3.001" expanded="true" height="82" name="Rename (2)" width="90" x="447" y="34">
            <parameter key="old_name" value="count(count)"/>
            <parameter key="new_name" value="Qty"/>
            <list key="rename_additional_attributes"/>
            <description align="center" color="transparent" colored="false" width="126">some label cleanup</description>
          </operator>
          <!-- Pivots on "label": rows are grouped by keyword, "label" is the column grouping
               attribute, and each cell takes the first Qty value of its group. -->
          <operator activated="true" class="blending:pivot" compatibility="9.3.001" expanded="true" height="82" name="Pivot" width="90" x="581" y="34">
            <parameter key="group_by_attributes" value="keyword"/>
            <parameter key="column_grouping_attribute" value="label"/>
            <list key="aggregation_attributes">
              <parameter key="Qty" value="first"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <operator activated="true" class="replace_missing_values" compatibility="9.3.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="715" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default" value="value"/>
            <list key="columns"/>
            <parameter key="replenishment_value" value="0"/>
            <description align="center" color="transparent" colored="false" width="126">set all missing to 0</description>
          </operator>
          <!-- Matches the prefix first(Qty)_ in attribute names. No replace_by is given, so the
               match is presumably replaced by an empty string (TODO confirm). -->
          <operator activated="true" class="rename_by_replacing" compatibility="9.3.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="first\(Qty\)_"/>
            <description align="center" color="transparent" colored="false" width="126">some label cleanup</description>
          </operator>
          <connect from_port="in 1" to_op="Loop Values" to_port="input 1"/>
          <connect from_op="Loop Values" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
          <connect from_op="Append (2)" from_port="merged set" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Pivot" to_port="input"/>
          <connect from_op="Pivot" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Neural Net ("train your model"):
           learner configured with 200 training cycles, learning rate 0.01 and momentum 0.9;
           shuffle and normalize are on, decay is off, error_epsilon is 1.0E-4.
           The hidden_layers list is empty, so no layer layout is specified explicitly here
           and the operator's own default applies (see the operator documentation).
           use_local_random_seed is false; local_random_seed is presumably ignored in that case (confirm). -->
      <operator activated="true"
                class="neural_net"
                compatibility="9.3.001"
                expanded="true"
                height="82"
                name="Neural Net"
                width="90"
                x="715"
                y="34">
        <!-- network layout -->
        <list key="hidden_layers"/>
        <!-- training settings -->
        <parameter key="training_cycles" value="200"/>
        <parameter key="learning_rate" value="0.01"/>
        <parameter key="momentum" value="0.9"/>
        <parameter key="decay" value="false"/>
        <parameter key="shuffle" value="true"/>
        <parameter key="normalize" value="true"/>
        <parameter key="error_epsilon" value="1.0E-4"/>
        <!-- random seed settings -->
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <description align="center"
                     color="transparent"
                     colored="false"
                     width="126">train your model</description>
      </operator>
      <!-- Apply Model:
           in this process its model port is fed by "Neural Net" and its unlabelled-data port
           by that same operator's exampleSet output, so the model is applied to the data it
           was trained on (the description below flags this as a known shortcut).
           No application parameters are set; create_view is false. -->
      <operator activated="true"
                class="apply_model"
                compatibility="9.3.001"
                expanded="true"
                height="82"
                name="Apply Model"
                width="90"
                x="849"
                y="34">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
        <description align="center"
                     color="transparent"
                     colored="false"
                     width="126">apply your model&lt;br&gt;(really bad implementation as you should use new data but there isn't much)</description>
      </operator>
      <!-- Neural Net (2) ("train your model"):
           second learner, carrying the same parameter values as "Neural Net":
           200 training cycles, learning rate 0.01, momentum 0.9, shuffle and normalize on,
           decay off, error_epsilon 1.0E-4, and an empty hidden_layers list.
           In this process its training set comes from output 1 of "Multiply".
           use_local_random_seed is false; local_random_seed is presumably ignored in that case (confirm). -->
      <operator activated="true"
                class="neural_net"
                compatibility="9.3.001"
                expanded="true"
                height="82"
                name="Neural Net (2)"
                width="90"
                x="514"
                y="442">
        <!-- network layout -->
        <list key="hidden_layers"/>
        <!-- training settings -->
        <parameter key="training_cycles" value="200"/>
        <parameter key="learning_rate" value="0.01"/>
        <parameter key="momentum" value="0.9"/>
        <parameter key="decay" value="false"/>
        <parameter key="shuffle" value="true"/>
        <parameter key="normalize" value="true"/>
        <parameter key="error_epsilon" value="1.0E-4"/>
        <!-- random seed settings -->
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <description align="center"
                     color="transparent"
                     colored="false"
                     width="126">train your model</description>
      </operator>
      <!-- Apply Model (2):
           in this process its model port is fed by "Neural Net (2)" and its unlabelled-data
           port by that same operator's exampleSet output, so the model is applied to the data
           it was trained on (the description below flags this as a known shortcut).
           No application parameters are set; create_view is false. -->
      <operator activated="true"
                class="apply_model"
                compatibility="9.3.001"
                expanded="true"
                height="82"
                name="Apply Model (2)"
                width="90"
                x="648"
                y="442">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
        <description align="center"
                     color="transparent"
                     colored="false"
                     width="126">apply your model&lt;br&gt;(really bad implementation as you should use new data but there isn't much)</description>
      </operator>
      <connect from_op="Read CSV" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Neural Net" to_port="training set"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Rename by Replacing (2)" to_port="example set input"/>
      <connect from_op="Rename by Replacing (2)" from_port="example set output" to_port="result 2"/>
      <connect from_op="Read CSV (2)" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Neural Net (2)" to_port="training set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_port="result 4"/>
      <connect from_op="Neural Net" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Neural Net" from_port="exampleSet" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Neural Net (2)" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Neural Net (2)" from_port="exampleSet" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <description align="center" color="yellow" colored="false" height="359" resized="true" width="1137" x="51" y="388">Using export from python</description>
      <description align="center" color="green" colored="true" height="362" resized="true" width="1140" x="47" y="10">Simulate Python process</description>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

How to create a process to already prepared data (NLP, NN)

Answers