Preprocessing ideas to improve Memory/RAM usage efficiency
Hi,
For a project, I am researching the relation between the financial ratios of firms and their long-term stock returns. I have been trying to compute derived prices of stocks, including their dividend distributions, in three nested loops. My data set covers ten years of data, with around 845,000 rows and 60 columns. Within the three nested loops, I first loop through the values of the stock IDs and calculate the derived prices. In every iteration I store the resulting small set so that memory can be freed. Sometimes I saw progress values of 18-20 percent on the Loop Values operator, yet I could not get the process to finish: I always end up with a frozen RapidMiner (RM). My PC has 16 GB of RAM and a 128 GB SSD with some pagefile. Below is the XML for this process.
How could I improve this process, or how should I change my approach? Thanks in advance.
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
  <!--
    RapidMiner process: derived stock prices per PERMNO.

    Flow, as wired by the connect elements below:
      1. Retrieve the monthly stock data.
      2. "ID &amp; Sort": convert a date attribute, sort three times, generate ids.
      3. Generate Attributes: Return Factor = Returns+1.00, Derived Price = 1.00.
      4. Loop Values over PERMNO: for every example, multiply the Return Factor
         values of examples 1..k into a macro and write it to Derived Price,
         then join onto the rows of that PERMNO and store the result.
      5. Append Final merges the output of Loop Values.

    Literal ampersands in operator names are written as &amp;amp; so that the
    document is well-formed; the parsed operator names are unchanged.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve monthly_stock_2008_2017_permno" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../01 Data/monthly_stock_2008_2017_permno"/>
      </operator>
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="ID &amp; Sort" width="90" x="179" y="34">
        <process expanded="true">
          <operator activated="true" class="date_to_numerical" compatibility="8.1.003" expanded="true" height="82" name="Date to Numerical" width="90" x="45" y="34">
            <parameter key="attribute_name" value="Alternate Price Date"/>
            <parameter key="time_unit" value="month"/>
            <parameter key="hour_relative_to" value="epoch"/>
            <parameter key="day_relative_to" value="epoch"/>
            <parameter key="month_relative_to" value="epoch"/>
            <parameter key="keep_old_attribute" value="true"/>
          </operator>
          <!-- Three consecutive sorts: Payment Date, then Alternate Price Date, then PERMNO.
               NOTE(review): presumably relies on Sort being stable, so that the final order
               is PERMNO, then Alternate Price Date, then Payment Date. Confirm. -->
          <operator activated="true" class="sort" compatibility="8.1.003" expanded="true" height="82" name="Sort" width="90" x="180" y="34">
            <parameter key="attribute_name" value="Payment Date"/>
          </operator>
          <operator activated="true" class="sort" compatibility="8.1.003" expanded="true" height="82" name="Sort (2)" width="90" x="313" y="34">
            <parameter key="attribute_name" value="Alternate Price Date"/>
          </operator>
          <operator activated="true" class="sort" compatibility="8.1.003" expanded="true" height="82" name="Sort (3)" width="90" x="447" y="34">
            <parameter key="attribute_name" value="PERMNO"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="8.1.003" expanded="true" height="82" name="Generate ID" width="90" x="581" y="34"/>
          <connect from_port="in 1" to_op="Date to Numerical" to_port="example set input"/>
          <connect from_op="Date to Numerical" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Sort (2)" to_port="example set input"/>
          <connect from_op="Sort (2)" from_port="example set output" to_op="Sort (3)" to_port="example set input"/>
          <connect from_op="Sort (3)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Adds the per-row growth factor and a Derived Price column initialised to 1.00;
           Derived Price is overwritten row by row by "Set Data" inside the loops. -->
      <operator activated="true" class="generate_attributes" compatibility="8.1.003" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="85">
        <list key="function_descriptions">
          <parameter key="Return Factor" value="Returns+1.00"/>
          <parameter key="Derived Price" value="1.00"/>
        </list>
      </operator>
      <operator activated="true" class="numerical_to_polynominal" compatibility="8.1.003" expanded="true" height="82" name="Numerical to Polynominal" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="PERMNO"/>
      </operator>
      <!-- Level 1 loop: one iteration per PERMNO value, parallel execution disabled.
           Input 1 is the set with PERMNO converted to polynominal; input 2 comes from
           the "original" port of Generate Attributes. Both full sets are filtered
           again on every iteration. -->
      <operator activated="true" class="concurrency:loop_values" compatibility="8.1.003" expanded="true" height="103" name="Loop Values" width="90" x="581" y="85">
        <parameter key="attribute" value="PERMNO"/>
        <parameter key="enable_parallel_execution" value="false"/>
        <process expanded="true">
          <operator activated="true" class="filter_examples" compatibility="8.1.003" expanded="true" height="103" name="Filter permno &amp; nonmissing" width="90" x="45" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="PERMNO.equals.%{loop_value}"/>
              <parameter key="filters_entry_key" value="Returns.is_not_missing."/>
            </list>
          </operator>
          <!-- Level 2 loop: one iteration per example of the current PERMNO; the
               iteration number is exposed as the macro loop_example_outer. -->
          <operator activated="true" class="loop_examples" compatibility="8.1.003" expanded="true" height="82" name="Loop Examples" width="90" x="179" y="34">
            <parameter key="iteration_macro" value="loop_example_outer"/>
            <process expanded="true">
              <!-- Resets the running product to 1.00 for every example. -->
              <operator activated="true" class="generate_macro" compatibility="8.1.003" expanded="true" height="82" name="Generate Macro (3)" width="90" x="45" y="34">
                <list key="function_descriptions">
                  <parameter key="kumulatif_fiyat" value="1.00"/>
                </list>
              </operator>
              <!-- Level 3 loop: runs %{loop_example_outer} iterations, i.e. it re-multiplies
                   the Return Factor of every earlier example for each example. The number of
                   inner iterations per PERMNO therefore grows quadratically with its row count.
                   NOTE(review): %{iteration} is presumably the default iteration macro of
                   this Loop operator. Confirm. -->
              <operator activated="true" class="concurrency:loop" compatibility="8.1.003" expanded="true" height="82" name="Loop" width="90" x="179" y="34">
                <parameter key="number_of_iterations" value="%{loop_example_outer}"/>
                <parameter key="reuse_results" value="true"/>
                <process expanded="true">
                  <operator activated="true" class="extract_macro" compatibility="8.1.003" expanded="true" height="68" name="Extract Macro" width="90" x="179" y="34">
                    <parameter key="macro" value="tekil_fiyat"/>
                    <parameter key="macro_type" value="data_value"/>
                    <parameter key="attribute_name" value="Return Factor"/>
                    <parameter key="example_index" value="%{iteration}"/>
                    <list key="additional_macros"/>
                  </operator>
                  <!-- Running product: kumulatif_fiyat = kumulatif_fiyat * tekil_fiyat. -->
                  <operator activated="true" class="generate_macro" compatibility="8.1.003" expanded="true" height="82" name="Generate Macro (2)" width="90" x="313" y="34">
                    <list key="function_descriptions">
                      <parameter key="kumulatif_fiyat" value="eval(%{kumulatif_fiyat})*eval(%{tekil_fiyat})"/>
                    </list>
                  </operator>
                  <connect from_port="input 1" to_op="Extract Macro" to_port="example set"/>
                  <connect from_op="Extract Macro" from_port="example set" to_op="Generate Macro (2)" to_port="through 1"/>
                  <connect from_op="Generate Macro (2)" from_port="through 1" to_port="output 1"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
              </operator>
              <!-- Writes the accumulated product into Derived Price of the current example. -->
              <operator activated="true" class="set_data" compatibility="8.1.003" expanded="true" height="82" name="Set Data" width="90" x="313" y="34">
                <parameter key="example_index" value="%{loop_example_outer}"/>
                <parameter key="attribute_name" value="Derived Price"/>
                <parameter key="value" value="%{kumulatif_fiyat}"/>
                <list key="additional_values"/>
              </operator>
              <connect from_port="example set" to_op="Generate Macro (3)" to_port="through 1"/>
              <connect from_op="Generate Macro (3)" from_port="through 1" to_op="Loop" to_port="input 1"/>
              <connect from_op="Loop" from_port="output 1" to_op="Set Data" to_port="example set input"/>
              <connect from_op="Set Data" from_port="example set output" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>
          <!-- NOTE(review): only "example set 1" of this Append is connected (from Loop
               Examples), so it merges a single input on every PERMNO iteration. It looks
               removable by wiring Loop Examples straight to the Join. Confirm. -->
          <operator activated="true" class="append" compatibility="8.1.003" expanded="true" height="82" name="Append Prices" width="90" x="313" y="34"/>
          <operator activated="true" class="filter_examples" compatibility="8.1.003" expanded="true" height="103" name="Filter permno" width="90" x="45" y="187">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="PERMNO.eq.%{loop_value}"/>
            </list>
          </operator>
          <!-- Left join with no key attributes listed.
               NOTE(review): presumably joins on the id created by "Generate ID". Confirm. -->
          <operator activated="true" class="concurrency:join" compatibility="8.1.003" expanded="true" height="82" name="Join" width="90" x="447" y="136">
            <parameter key="join_type" value="left"/>
            <list key="key_attributes"/>
          </operator>
          <!-- One repository entry per PERMNO value. -->
          <operator activated="true" class="store" compatibility="8.1.003" expanded="true" height="68" name="Store" width="90" x="581" y="136">
            <parameter key="repository_entry" value="//StockInvestment/01 Data/monthly stock derived prices/%{loop_value}"/>
          </operator>
          <operator activated="true" class="free_memory" compatibility="8.1.003" expanded="true" height="82" name="Free Memory" width="90" x="715" y="136"/>
          <!-- The operators from here to "Free Memory (2)" are deactivated (activated="false"). -->
          <operator activated="false" class="handle_exception" compatibility="8.1.003" expanded="true" height="82" name="Handle Exception" width="90" x="246" y="340">
            <process expanded="true">
              <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve Derived Price File" width="90" x="112" y="34">
                <parameter key="repository_entry" value="//StockInvestment/01 Data/monthly_stock_2008_2017_permno_derived_prices"/>
              </operator>
              <connect from_op="Retrieve Derived Price File" from_port="output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <process expanded="true">
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="false" class="append" compatibility="8.1.003" expanded="true" height="82" name="Append Stocks" width="90" x="380" y="442"/>
          <operator activated="false" class="remove_duplicates" compatibility="8.1.003" expanded="true" height="103" name="Remove Duplicates" width="90" x="514" y="442">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="false" class="store" compatibility="8.1.003" expanded="true" height="68" name="Store Derived Price File" width="90" x="648" y="442">
            <parameter key="repository_entry" value="//StockInvestment/01 Data/monthly_stock_2008_2017_permno_derived_prices"/>
          </operator>
          <operator activated="false" breakpoints="before,after" class="delete_repository_entry" compatibility="8.1.003" expanded="true" height="68" name="Delete Repository Entry" width="90" x="782" y="442">
            <parameter key="entry_to_delete" value="//StockInvestment/01 Data/monthly stock derived prices/%{loop_value}"/>
            <description align="center" color="transparent" colored="false" width="126">to be activated if necessary</description>
          </operator>
          <operator activated="false" class="free_memory" compatibility="8.1.003" expanded="true" height="82" name="Free Memory (2)" width="90" x="916" y="442"/>
          <connect from_port="input 1" to_op="Filter permno &amp; nonmissing" to_port="example set input"/>
          <connect from_port="input 2" to_op="Filter permno" to_port="example set input"/>
          <connect from_op="Filter permno &amp; nonmissing" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="example set" to_op="Append Prices" to_port="example set 1"/>
          <connect from_op="Append Prices" from_port="merged set" to_op="Join" to_port="right"/>
          <connect from_op="Filter permno" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Join" from_port="join" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_op="Free Memory" to_port="through 1"/>
          <connect from_op="Free Memory" from_port="through 1" to_port="output 1"/>
          <connect from_op="Handle Exception" from_port="out 1" to_op="Append Stocks" to_port="example set 1"/>
          <connect from_op="Append Stocks" from_port="merged set" to_op="Remove Duplicates" to_port="example set input"/>
          <connect from_op="Remove Duplicates" from_port="example set output" to_op="Store Derived Price File" to_port="input"/>
          <connect from_op="Store Derived Price File" from_port="through" to_op="Free Memory (2)" to_port="through 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="105"/>
          <portSpacing port="source_input 3" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merges what Loop Values delivers on "output 1" into the process result. -->
      <operator activated="true" class="append" compatibility="8.1.003" expanded="true" height="82" name="Append Final" width="90" x="715" y="85"/>
      <connect from_op="Retrieve monthly_stock_2008_2017_permno" from_port="output" to_op="ID &amp; Sort" to_port="in 1"/>
      <connect from_op="ID &amp; Sort" from_port="out 1" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Numerical to Polynominal" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="original" to_op="Loop Values" to_port="input 2"/>
      <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Loop Values" to_port="input 1"/>
      <connect from_op="Loop Values" from_port="output 1" to_op="Append Final" to_port="example set 1"/>
      <connect from_op="Append Final" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Best Answer
-
Edin_Klapic Employee-RapidMiner, RMResearcher, Member Posts: 299 RM Data Scientist
Hi @suleymansahal,
At a quick glance I could identify 3 nested Loop Operators. Those are performance and runtime bottlenecks in every process.
The only recommendation I can give is to replace the Loops with other transformations and/or to split your dataset beforehand so that there are fewer rows in the process.
Ideally, for such resource-hungry processes you would use a RapidMiner Server dedicated to computing.
Happy Mining,
Edin
5
Answers
Thanks for the reply. The process was indeed too heavy for my PC. The second-level inner loop operator had an Append operator. It was not really necessary; I only wanted to see the interim result. When I removed it, I was finally able to finish the whole process. So I learned that Append is a really resource-demanding operator. Thank you again.