The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
How to compare three lists?
Hi All!
I have a data set with 3 columns, each containing a list of words. I need to compare one list (list3) with the 2 other lists (the master lists, list1 and list2) and get the count of its words that belong to list1 and to list2. list1 and list2 contain unique words and act as master lists; list3 contains words that belong to list1 and list2.
sample data:
list1 list2 list3
shape,size,whole type,diff,lit shape,lit
hole,jio,kit pen,tilm,mil pen,mil
Required output:
list1 list2 list3 count_list1 count_list2
shape,size,whole type,diff,lit shape,lit 1 1
hole,jio,kit pen,tilm,mil pen,mil 0 2
here list3 has
i. "shape","lit": "shape" is available in list1, so count_list1 is "1", and "lit" is available in list2, so count_list2 is "1".
ii. "pen","mil": "pen" and "mil" are both available in list2, so count_list2 is "2", and count_list1 is "0" because neither word belongs to list1.
could anyone help me in resolving this?
Thanks in Advance!
I have a data set with 3 columns, each containing a list of words. I need to compare one list (list3) with the 2 other lists (the master lists, list1 and list2) and get the count of its words that belong to list1 and to list2. list1 and list2 contain unique words and act as master lists; list3 contains words that belong to list1 and list2.
sample data:
list1 list2 list3
shape,size,whole type,diff,lit shape,lit
hole,jio,kit pen,tilm,mil pen,mil
Required output:
list1 list2 list3 count_list1 count_list2
shape,size,whole type,diff,lit shape,lit 1 1
hole,jio,kit pen,tilm,mil pen,mil 0 2
here list3 has
i. "shape","lit": "shape" is available in list1, so count_list1 is "1", and "lit" is available in list2, so count_list2 is "1".
ii. "pen","mil": "pen" and "mil" are both available in list2, so count_list2 is "2", and count_list1 is "0" because neither word belongs to list1.
could anyone help me in resolving this?
Thanks in Advance!
0
Answers
I don't have a ready-made solution, but a few pointers.
You could use the Split operator to create attributes like list1_1, list1_2, list1_3 etc. with the unique words in the lists.
Then Loop Attributes on list2_.+ and list3_.+ (regular expression selectors) and inside that Generate Attributes that compares the current attribute value to the reference list and increases the count if there's a match.
Regards,
Balázs
I'm running Loop Attributes on list3 after the split and checking with if(contains(list1,%{loop attribute}),1,0), but it throws the error "Not enough iterations". I'm unable to resolve this.
could you please help me?
it is a property of many RapidMiner loops that they throw an exception if the input is empty.
The solution for this problem is using Handle Exception around the loop.
Just put the loop into the left part of the exception handler and connect the input with the output on the right side.
Regards,
Balázs
Maybe the approach I took on this thread may help you.
https://community.rapidminer.com/discussion/58965/which-amazon-instance-to-chose-for-a-loop-in-loop-process-requiring-a-huge-amount-of-memory#latest
Let me know if it helps.
<!-- Process context: the input, output and macros declarations are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.002" expanded="true" name="Process">
<!-- Settings of the root "Process" operator: log verbosity "init", random seed 2001,
     mail sending set to "never" (empty notification address), encoding "SYSTEM". -->
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!-- Builds the sample table from inline text: a header row (list1;list2;list3) and two
     data rows, with ";" as the column separator.
     The rows are separated by line feeds written as the character reference &#10;.
     A literal line break (or a plain space) inside an attribute value would not survive
     XML attribute-value normalization, and the whole text would be read as a single row. -->
<operator activated="true" class="utility:create_exampleset" compatibility="9.9.002" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
  <parameter key="generator_type" value="comma separated text"/>
  <parameter key="number_of_examples" value="100"/>
  <parameter key="use_stepsize" value="false"/>
  <list key="function_descriptions"/>
  <parameter key="add_id_attribute" value="false"/>
  <list key="numeric_series_configuration"/>
  <list key="date_series_configuration"/>
  <list key="date_series_configuration (interval)"/>
  <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
  <parameter key="time_zone" value="SYSTEM"/>
  <!-- One record per line: header, then the two sample rows from the question. -->
  <parameter key="input_csv_text" value="list1;list2;list3&#10;shape,size,whole;type,diff,lit;shape,lit&#10;hole,jio,kit;pen,tilm,mil;pen,mil"/>
  <parameter key="column_separator" value=";"/>
  <parameter key="parse_all_as_nominal" value="false"/>
  <parameter key="decimal_point_character" value="."/>
  <parameter key="trim_attribute_names" value="true"/>
</operator>
<!-- Adds an id attribute (non-nominal ids, offset 0); the id is used as the join key
     by the Join operator at the end of the process. -->
<operator activated="true" class="generate_id" compatibility="9.9.002" expanded="true" height="82" name="Generate ID" width="90" x="45" y="187">
  <parameter key="create_nominal_ids" value="false"/>
  <parameter key="offset" value="0"/>
  <description align="center" color="transparent" colored="false" width="126">create id for later join</description>
</operator>
<!-- Renames attribute list3 to master_list so that later regular expressions can tell
     it apart from list1 and list2. -->
<operator activated="true" class="blending:rename" compatibility="9.9.002" expanded="true" height="82" name="Rename" width="90" x="179" y="187">
  <list key="rename attributes">
    <parameter key="list3" value="master_list"/>
  </list>
  <parameter key="from_attribute" value=""/>
  <parameter key="to_attribute" value=""/>
  <description align="center" color="transparent" colored="false" width="126">rename list3 to master_list</description>
</operator>
<!-- Transposes the example set; no parameters are set on this operator. -->
<operator activated="true"
          class="transpose"
          compatibility="9.9.002"
          expanded="true"
          height="82"
          name="Transpose"
          width="90"
          x="313"
          y="187"/>
<!-- Custom filter "id equals master_list" with invert_filter=true, i.e. the examples
     whose id is master_list are the ones removed. -->
<operator activated="true" class="filter_examples" compatibility="9.9.002" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="187">
  <parameter key="parameter_expression" value=""/>
  <parameter key="condition_class" value="custom_filters"/>
  <parameter key="invert_filter" value="true"/>
  <list key="filters_list">
    <parameter key="filters_entry_key" value="id.equals.master_list"/>
  </list>
  <parameter key="filters_logic_and" value="true"/>
  <parameter key="filters_check_metadata" value="true"/>
</operator>
<!-- Renames attribute id to lists on the transposed, filtered example set. -->
<operator activated="true" class="blending:rename" compatibility="9.9.002" expanded="true" height="82" name="Rename (2)" width="90" x="581" y="187">
  <list key="rename attributes">
    <parameter key="id" value="lists"/>
  </list>
  <parameter key="from_attribute" value=""/>
  <parameter key="to_attribute" value=""/>
</operator>
<!-- Splits attribute values on "," in ordered_split mode; the attribute filter type is
     "all", so no attribute subset is selected by name. -->
<operator activated="true" class="split" compatibility="9.9.002" expanded="true" height="82" name="Split" width="90" x="447" y="493">
  <parameter key="attribute_filter_type" value="all"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value=""/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="nominal"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="file_path"/>
  <parameter key="block_type" value="single_value"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="single_value"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
  <parameter key="split_pattern" value=","/>
  <parameter key="split_mode" value="ordered_split"/>
  <description align="center" color="transparent" colored="false" width="126">split attributes by separator</description>
</operator>
<!-- Outer loop: iterates over every attribute matching list.* and exposes the current
     attribute name as macro loop_attribute1. For each one it creates a counter attribute
     count_<name> initialised to 0, then runs the inner loop over the master_list.*
     attributes to increment that counter on matches.
     The except expression list3.* is present but inactive (use_except_expression=false). -->
<operator activated="true" class="concurrency:loop_attributes" compatibility="9.9.002" expanded="true" height="82" name="Loop Attributes (4)" width="90" x="581" y="340">
  <parameter key="attribute_filter_type" value="regular_expression"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value=""/>
  <parameter key="regular_expression" value="list.*"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="except_regular_expression" value="list3.*"/>
  <parameter key="value_type" value="attribute_value"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="time"/>
  <parameter key="block_type" value="attribute_block"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_matrix_row_start"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
  <parameter key="attribute_name_macro" value="loop_attribute1"/>
  <parameter key="reuse_results" value="true"/>
  <parameter key="enable_parallel_execution" value="true"/>
  <process expanded="true">
    <!-- Creates the counter attribute for the current list attribute, starting at 0. -->
    <operator activated="true" class="generate_attributes" compatibility="9.9.002" expanded="true" height="82" name="Generate Attributes (5)" width="90" x="112" y="34">
      <list key="function_descriptions">
        <parameter key="count_%{loop_attribute1}" value="0"/>
      </list>
      <parameter key="keep_all" value="true"/>
    </operator>
    <!-- Inner loop: iterates over every attribute matching master_list.* and exposes
         the current attribute name as macro loop_attribute. -->
    <operator activated="true" class="concurrency:loop_attributes" compatibility="9.9.002" expanded="true" height="82" name="Loop Attributes (5)" width="90" x="380" y="34">
      <parameter key="attribute_filter_type" value="regular_expression"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="regular_expression" value="master_list.*"/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="attribute_name_macro" value="loop_attribute"/>
      <parameter key="reuse_results" value="true"/>
      <parameter key="enable_parallel_execution" value="true"/>
      <process expanded="true">
        <!-- Stores the counter attribute's name in macro count_attribute.
             The expression is the string literal "count_" concatenated with the outer
             macro; its double quotes must be written as &quot; because they sit inside
             a double-quoted XML attribute value. -->
        <operator activated="true" class="generate_macro" compatibility="9.9.002" expanded="true" height="82" name="Generate Macro" width="90" x="179" y="34">
          <list key="function_descriptions">
            <parameter key="count_attribute" value="&quot;count_&quot;+%{loop_attribute1}"/>
          </list>
        </operator>
        <!-- Updates the counter: it is left unchanged when either compared value is
             missing, incremented by 1 when the two values are equal, and left unchanged
             otherwise. The #{...} references are presumed to resolve to the attribute
             named by the macro (see the RapidMiner expression documentation). -->
        <operator activated="true" class="generate_attributes" compatibility="9.9.002" expanded="true" height="82" name="Generate Attributes (6)" width="90" x="380" y="34">
          <list key="function_descriptions">
            <parameter key="%{count_attribute}" value="if(missing(#{loop_attribute}),#{count_attribute},if(missing(#{loop_attribute1}),#{count_attribute},if(#{loop_attribute} == #{loop_attribute1},#{count_attribute}+1,#{count_attribute})))"/>
          </list>
          <parameter key="keep_all" value="true"/>
        </operator>
        <connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
        <connect from_op="Generate Macro" from_port="through 1" to_op="Generate Attributes (6)" to_port="example set input"/>
        <connect from_op="Generate Attributes (6)" from_port="example set output" to_port="output 1"/>
        <portSpacing port="source_input 1" spacing="0"/>
        <portSpacing port="source_input 2" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
      </process>
    </operator>
    <connect from_port="input 1" to_op="Generate Attributes (5)" to_port="example set input"/>
    <connect from_op="Generate Attributes (5)" from_port="example set output" to_op="Loop Attributes (5)" to_port="input 1"/>
    <connect from_op="Loop Attributes (5)" from_port="output 1" to_port="output 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">generate count_list Attributes for each element of master list</description>
</operator>
<!-- Iterates over the values of attribute "id" (current value in macro loop_value).
     In each iteration, the example set arriving at input 2 gets a new attribute
     count_<value> holding the sum of all attributes matching count_<value>.* .
     NOTE(review): the upstream "Rename (2)" operator renames id to lists before this
     operator's input 1; confirm that "id" still resolves here. -->
<operator activated="true" class="concurrency:loop_values" compatibility="9.9.002" expanded="true" height="103" name="Loop Values" width="90" x="715" y="187">
  <parameter key="attribute" value="id"/>
  <parameter key="iteration_macro" value="loop_value"/>
  <parameter key="reuse_results" value="false"/>
  <parameter key="enable_parallel_execution" value="true"/>
  <process expanded="true">
    <!-- Sums the per-token counters of the current list into one aggregate attribute. -->
    <operator activated="true" class="generate_aggregation" compatibility="9.9.002" expanded="true" height="82" name="Generate Aggregation (2)" width="90" x="179" y="34">
      <parameter key="attribute_name" value="count_%{loop_value}"/>
      <parameter key="attribute_filter_type" value="regular_expression"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="regular_expression" value="count_%{loop_value}.*"/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="aggregation_function" value="sum"/>
      <parameter key="concatenation_separator" value="|"/>
      <parameter key="keep_all" value="true"/>
      <parameter key="ignore_missings" value="true"/>
      <parameter key="ignore_missing_attributes" value="false"/>
    </operator>
    <connect from_port="input 2" to_op="Generate Aggregation (2)" to_port="example set input"/>
    <connect from_op="Generate Aggregation (2)" from_port="example set output" to_port="output 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="source_input 3" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Generate aggregated count_list Attributes per list attribute</description>
</operator>
<!-- Merges the example sets produced by the Loop Values iterations into one set;
     for duplicate attributes only the first occurrence is kept. -->
<operator activated="true" class="operator_toolbox:merge" compatibility="2.10.000" expanded="true" height="82" name="Merge Attributes" width="90" x="849" y="187">
  <parameter key="handling_of_duplicate_attributes" value="keep_only_first"/>
  <parameter key="handling_of_special_attributes" value="keep_first_special_other_regular"/>
  <parameter key="handling_of_duplicate_annotations" value="rename"/>
</operator>
<!-- Keeps only the attributes whose name matches count_list\d+ (the aggregated
     counters); special attributes are not part of the selection filter. -->
<operator activated="true" class="select_attributes" compatibility="9.9.002" expanded="true" height="82" name="Select Attributes" width="90" x="849" y="340">
  <parameter key="attribute_filter_type" value="regular_expression"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value=""/>
  <parameter key="regular_expression" value="count_list\d+"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="attribute_value"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="time"/>
  <parameter key="block_type" value="attribute_block"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_matrix_row_start"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
  <description align="center" color="transparent" colored="false" width="126">only keeps count Attributes</description>
</operator>
<!-- Inner join keyed on the id attribute; duplicate attributes are removed and only
     one copy of the join attribute is kept. -->
<operator activated="true" class="concurrency:join" compatibility="9.9.002" expanded="true" height="82" name="Join" width="90" x="983" y="493">
  <parameter key="remove_double_attributes" value="true"/>
  <parameter key="join_type" value="inner"/>
  <parameter key="use_id_attribute_as_key" value="true"/>
  <list key="key_attributes"/>
  <parameter key="keep_both_join_attributes" value="false"/>
  <description align="center" color="transparent" colored="false" width="126">join with original dataset</description>
</operator>
<!-- Top-level wiring of the process. -->
<!-- Source chain: sample data, id generation, rename of list3, transpose. -->
<connect from_op="Create ExampleSet" from_port="output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<!-- Transpose feeds two branches: the transposed set goes to Filter Examples,
     while its "original" port goes to Split. -->
<connect from_op="Transpose" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Transpose" from_port="original" to_op="Split" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Loop Values" to_port="input 1"/>
<!-- Split output goes through the counting loop; Split's "original" port is the
     right-hand side of the final Join. -->
<connect from_op="Split" from_port="example set output" to_op="Loop Attributes (4)" to_port="input 1"/>
<connect from_op="Split" from_port="original" to_op="Join" to_port="right"/>
<connect from_op="Loop Attributes (4)" from_port="output 1" to_op="Loop Values" to_port="input 2"/>
<!-- Aggregated counters are merged, reduced to the count attributes and joined. -->
<connect from_op="Loop Values" from_port="output 1" to_op="Merge Attributes" to_port="example set 1"/>
<connect from_op="Merge Attributes" from_port="merged set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Edin