The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
[SOLVED] Union of example sets
Hi All,
I need to append the resultset using different queries where the userID is changing. So first i read a list of userID from a CSV file (this file only has one column, userID). Then I can iterate over the userID using a Loop Value operator. Inside the loop operator I have a readDatabase operator using the following query:
SELECT
"movieID",
genre
FROM movielens10m."Movie"
WHERE "movieID" IN
(
SELECT
"Rating"."MovieID"
FROM
movielens10m."Rating"
WHERE
"Rating"."userID" = %{loop_value}
)
Genre is a compound attribute which contains something like value1|value3|value7. After the loop execution, It's sure that output has movieID. But genre maybe has different length of columns for each collection's item. When I try to use the append operator an error is araised because the length of columns is different for each collection's item. I can't use union because this operator does not receive a collection type.
What do you suggest me to do?
Thanks in advance. Below is the flow
I need to append the resultset using different queries where the userID is changing. So first i read a list of userID from a CSV file (this file only has one column, userID). Then I can iterate over the userID using a Loop Value operator. Inside the loop operator I have a readDatabase operator using the following query:
SELECT
"movieID",
genre
FROM movielens10m."Movie"
WHERE "movieID" IN
(
SELECT
"Rating"."MovieID"
FROM
movielens10m."Rating"
WHERE
"Rating"."userID" = %{loop_value}
)
Genre is a compound attribute which contains something like value1|value3|value7. After the loop execution, It's sure that output has movieID. But genre maybe has different length of columns for each collection's item. When I try to use the append operator an error is araised because the length of columns is different for each collection's item. I can't use union because this operator does not receive a collection type.
What do you suggest me to do?
Thanks in advance. Below is the flow
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
<process expanded="true" height="235" width="550">
<operator activated="true" class="read_csv" compatibility="5.2.006" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
<parameter key="csv_file" value="C:\Users\asistente\Documents\TesisSF\RapidDS\Iterar10m.txt"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations"/>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="userID.true.polynominal.attribute"/>
</list>
</operator>
<operator activated="true" class="loop_values" compatibility="5.2.006" expanded="true" height="76" name="Loop Values" width="90" x="179" y="30">
<parameter key="attribute" value="userID"/>
<process expanded="true" height="554" width="685">
<operator activated="true" class="read_database" compatibility="5.2.006" expanded="true" height="60" name="MoviesDB (2)" width="90" x="112" y="30">
<parameter key="connection" value="PG_movielens100k"/>
<parameter key="query" value="SELECT "movieID", genre FROM movielens10m."Movie" WHERE "movieID" IN ( SELECT "Rating"."MovieID" FROM movielens10m."Rating" WHERE "Rating"."userID" = %{loop_value} )"/>
<enumeration key="parameters"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="movieID_Movies (2)" width="90" x="246" y="30">
<parameter key="name" value="movieID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="replace" compatibility="5.2.006" expanded="true" height="76" name="DashDelete (2)" width="90" x="380" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="genre"/>
<parameter key="replace_what" value="-"/>
</operator>
<operator activated="true" class="split" compatibility="5.2.006" expanded="true" height="76" name="Genres (2)" width="90" x="514" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="genre"/>
<parameter key="split_pattern" value="\|"/>
<parameter key="split_mode" value="unordered_split"/>
</operator>
<connect from_op="MoviesDB (2)" from_port="output" to_op="movieID_Movies (2)" to_port="example set input"/>
<connect from_op="movieID_Movies (2)" from_port="example set output" to_op="DashDelete (2)" to_port="example set input"/>
<connect from_op="DashDelete (2)" from_port="example set output" to_op="Genres (2)" to_port="example set input"/>
<connect from_op="Genres (2)" from_port="example set output" to_port="out 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="loop_collection" compatibility="5.2.006" expanded="true" height="60" name="Loop Collection" width="90" x="313" y="30">
<process expanded="true" height="554" width="685">
<operator activated="true" class="recall" compatibility="5.2.006" expanded="true" height="60" name="Recall" width="90" x="45" y="210">
<parameter key="name" value="movies"/>
<parameter key="io_object" value="ExampleSet"/>
</operator>
<operator activated="true" class="union" compatibility="5.2.006" expanded="true" height="76" name="Union" width="90" x="246" y="30"/>
<operator activated="true" class="remember" compatibility="5.2.006" expanded="true" height="60" name="Remember" width="90" x="447" y="30">
<parameter key="name" value="movies"/>
<parameter key="io_object" value="ExampleSet"/>
</operator>
<connect from_port="single" to_op="Union" to_port="example set 1"/>
<connect from_op="Recall" from_port="result" to_op="Union" to_port="example set 2"/>
<connect from_op="Union" from_port="union" to_op="Remember" to_port="store"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
</process>
</operator>
<operator activated="true" class="recall" compatibility="5.2.006" expanded="true" height="60" name="Recall (2)" width="90" x="450" y="30">
<parameter key="name" value="movies"/>
<parameter key="io_object" value="ExampleSet"/>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Loop Values" to_port="example set"/>
<connect from_op="Loop Values" from_port="out 1" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Recall (2)" from_port="result" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
0
Answers
Best, Marius
Just for information:
I just tried to reconstruct the given process in this thread when receiving an error that "no object with name XYZ was found...". In that case you have to take care on the order of operators in your process.
Solution: Open "view" -> "show view" -> "tree" and promote the "remember" operator to the top.
Hope that this helps.
PS: Still lots of operators just to bring a collection with different attributes together... but in the end it works.
All the best
Sachs
The only tricky thing here is how to define the first operator: for this, you have to right click the new first operator and select "bring to front". Obviously this only works if no other operator is connected to its input ports.
If I manage to shift off some time there will be a blog series about all these "hidden features" in RapidMiner.
Best,
~Marius