Am I building my Deep Learning model right?
I'm building a Deep Learning model in Rapidminer right now and I have some basic knowledge in Machine Learning, but I'm sometimes a bit confused how to implement my ideas in Rapidminer.
The basic idea is this:
- I have a dataset which I cluster with an HMM before the training in Rapidminer - The goal is to use the data from one state ("E1") to train my Deep Learning model and afterwards predict some Label ("TRUE"/"FALSE") in the rest of the dataset (so in principle every state except E1)
- Plus, I have a class imbalance in my data, I have way more "FALSE" labels than "TRUE" ones
My way of implementing this problem in Rapidminer is this:
- I retrieve the two datasets and create weights for the different labels (TRUE gets assigned a weight of 10, FALSE gets assigned a weight of 1) for dealing with the class imbalance. Afterwards I sample 50% of the training set, and run it through a Deep Learning classifier including a Leave-One-Out Cross-Validation
- Afterwards I apply this model to the Test set and predict the Performance (Binomial, because it's a binomial label)
I appended the two input files (anonymized in such a way that I can post them here while they are still sufficient for training/testing) and my Process as an XML-file.
My question now is whether there are any pitfalls or any basic things I'm overlooking. I'm still quite a beginner in the Machine Learning department and a complete beginner in Rapidminer. I'm just not sure if my way is scientifically correct, or if it could be better implemented in Rapidminer.
Best regards and thanks for your help
Jakob
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<context>
  <!-- No process-level inputs, outputs or macros are declared. -->
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<!-- Settings of the root "Process" operator: logging, seed, mail notification and encoding. -->
<parameter key="logverbosity" value="init"/>
<!-- NOTE(review): several operators below set use_local_random_seed="false"; presumably they then fall back to this process-wide seed. Confirm in the RapidMiner documentation. -->
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true"
          height="68" name="Retrieve Vista_wo_E1" width="90" x="45" y="238">
  <!-- Loads the repository entry Vista_wo_E1. Its output is wired to port "in 2" of "Set weight", the branch that is later scored by "Apply Model (2)". -->
  <parameter key="repository_entry" value="//Local Repository/Vista_wo_E1"/>
</operator>
<operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true"
          height="68" name="Retrieve Vista_E1" width="90" x="45" y="34">
  <!-- Loads the repository entry Vista_E1. Its output is wired to port "in 1" of "Set weight", the branch that feeds "Cross Validation". -->
  <parameter key="repository_entry" value="//Local Repository/Vista_E1"/>
</operator>
<operator activated="true" class="subprocess" compatibility="7.5.003" expanded="true" height="103" name="Set weight" width="90" x="179" y="34">
<process expanded="true">
<operator activated="true" class="generate_attributes" compatibility="7.5.003" expanded="true"
          height="82" name="Generate Attributes (2)" width="90" x="45" y="187">
  <!-- Scoring branch: derives a numeric attribute "label" that is 0 when isHeart equals the string FALSE and 1 otherwise. All existing attributes are kept. -->
  <!-- The quotes around FALSE must be written as the quot entity; a raw double quote would terminate the attribute value and make the document not well-formed. -->
  <list key="function_descriptions">
    <parameter key="label" value="if([isHeart] == &quot;FALSE&quot;,0,1)"/>
  </list>
  <parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="7.5.003" expanded="true"
          height="82" name="Generate Attributes (5)" width="90" x="179" y="187">
  <!-- Scoring branch: derives "weight" from the numeric "label": 1 where label equals 0, otherwise 10. All existing attributes are kept. -->
  <list key="function_descriptions">
    <parameter key="weight" value="if([label]==0,1,10)"/>
  </list>
  <parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="7.5.003" expanded="true"
          height="82" name="Set Role (3)" width="90" x="313" y="187">
  <!-- Scoring branch: gives the attribute "weight" the role "weight". No additional roles are set. -->
  <!-- NOTE(review): nothing visible in this subprocess assigns the label role to the generated "label" attribute. Verify which attribute carries the label role in the stored data, and that neither isHeart nor "label" stays a regular input attribute next to the target. -->
  <parameter key="attribute_name" value="weight"/>
  <parameter key="target_role" value="weight"/>
  <list key="set_additional_roles"/>
</operator>
<operator activated="true" class="numerical_to_binominal" compatibility="7.5.003" expanded="true"
          height="82" name="Numerical to Binominal (2)" width="90" x="514" y="187">
  <!-- Scoring branch: converts the single attribute "label" to binominal; special attributes are included in the filter. -->
  <!-- NOTE(review): with min = max = 0.0 the value 0 presumably maps to "false" and every other value to "true"; confirm in the operator help. -->
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="label"/>
  <parameter key="attributes" value=""/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="numeric"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="real"/>
  <parameter key="block_type" value="value_series"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_series_end"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="true"/>
  <parameter key="min" value="0.0"/>
  <parameter key="max" value="0.0"/>
</operator>
<operator activated="true" class="sample_bootstrapping" compatibility="7.5.003" expanded="true"
          height="82" name="Sample (3)" width="90" x="715" y="187">
  <!-- Scoring branch: bootstrap sample of relative size 0.5, drawn using the example weights. No local random seed is used. -->
  <!-- NOTE(review): this output goes to "Apply Model (2)", so only a weighted resample of Vista_wo_E1 is scored rather than every example. Confirm this is intended for the data that should be predicted. -->
  <parameter key="sample" value="relative"/>
  <parameter key="sample_size" value="100"/>
  <parameter key="sample_ratio" value="0.5"/>
  <parameter key="use_weights" value="true"/>
  <parameter key="use_local_random_seed" value="false"/>
  <parameter key="local_random_seed" value="1992"/>
</operator>
<operator activated="true" class="numerical_to_polynominal" compatibility="7.5.003" expanded="true"
          height="82" name="Numerical to Polynominal" width="90" x="45" y="34">
  <!-- Training branch: converts the listed subset of attributes (names 1 to 192) from numeric to polynominal; special attributes are excluded. -->
  <!-- NOTE(review): the scoring branch (port "in 2") has no matching conversion, so these attributes would differ in type between training and scoring data. Confirm whether the conversion is wanted at all and, if so, apply it to both branches. -->
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value="99|98|97|96|95|94|93|92|91|90|9|89|88|87|86|85|84|83|82|81|80|8|79|78|77|76|75|74|73|72|71|70|7|69|68|67|66|65|64|63|62|61|60|6|59|58|57|56|55|54|53|52|51|50|5|49|48|47|46|45|44|43|42|41|40|4|39|38|37|36|35|34|33|32|31|30|3|29|28|27|26|25|24|23|22|21|20|2|192|191|190|19|189|188|187|186|185|184|183|182|181|180|18|179|178|177|176|175|174|173|172|171|170|17|169|168|167|166|165|164|163|162|161|160|16|159|158|157|156|155|154|153|152|151|150|15|149|148|147|146|145|144|143|142|141|140|14|139|138|137|136|135|134|133|132|131|130|13|129|128|127|126|125|124|123|122|121|120|12|119|118|117|116|115|114|113|112|111|110|11|109|108|107|106|105|104|103|102|101|100|10|1"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="numeric"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="real"/>
  <parameter key="block_type" value="value_series"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_series_end"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="7.5.003" expanded="true"
          height="82" name="Generate Attributes (3)" width="90" x="179" y="34">
  <!-- Training branch: derives a numeric attribute "label" that is 0 when isHeart equals the string FALSE and 1 otherwise. All existing attributes are kept. -->
  <!-- The quotes around FALSE must be written as the quot entity; a raw double quote would terminate the attribute value and make the document not well-formed. -->
  <list key="function_descriptions">
    <parameter key="label" value="if([isHeart] == &quot;FALSE&quot;,0,1)"/>
  </list>
  <parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="7.5.003" expanded="true"
          height="82" name="Generate Attributes (4)" width="90" x="313" y="34">
  <!-- Training branch: derives "weight" from the numeric "label": 1 where label equals 0, otherwise 10. All existing attributes are kept. -->
  <list key="function_descriptions">
    <parameter key="weight" value="if([label]==0,1,10)"/>
  </list>
  <parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="7.5.003" expanded="true"
          height="82" name="Set Role (2)" width="90" x="447" y="34">
  <!-- Training branch: gives the attribute "weight" the role "weight". No additional roles are set. -->
  <!-- NOTE(review): nothing visible in this subprocess assigns the label role to the generated "label" attribute. Verify which attribute carries the label role in the stored data, and that neither isHeart nor "label" stays a regular input attribute next to the target. -->
  <parameter key="attribute_name" value="weight"/>
  <parameter key="target_role" value="weight"/>
  <list key="set_additional_roles"/>
</operator>
<operator activated="true" class="numerical_to_binominal" compatibility="7.5.003" expanded="true"
          height="82" name="Numerical to Binominal" width="90" x="581" y="34">
  <!-- Training branch: converts the single attribute "label" to binominal; special attributes are included in the filter. -->
  <!-- NOTE(review): with min = max = 0.0 the value 0 presumably maps to "false" and every other value to "true"; confirm in the operator help. -->
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="label"/>
  <parameter key="attributes" value=""/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="numeric"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="real"/>
  <parameter key="block_type" value="value_series"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_series_end"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="true"/>
  <parameter key="min" value="0.0"/>
  <parameter key="max" value="0.0"/>
</operator>
<operator activated="true" class="sample_bootstrapping" compatibility="7.5.003" expanded="true"
          height="82" name="Sample (2)" width="90" x="715" y="34">
  <!-- Training branch: bootstrap sample of relative size 0.5, drawn using the example weights. No local random seed is used. -->
  <!-- NOTE(review): this output goes to "Cross Validation". A bootstrap sample can contain the same example more than once, so copies may end up in both the training and the test part of a split. Confirm this does not inflate the validation estimate. -->
  <parameter key="sample" value="relative"/>
  <parameter key="sample_size" value="100"/>
  <parameter key="sample_ratio" value="0.5"/>
  <parameter key="use_weights" value="true"/>
  <parameter key="use_local_random_seed" value="false"/>
  <parameter key="local_random_seed" value="1992"/>
</operator>
<!-- Wiring of the "Set weight" subprocess. Two parallel chains are defined: "in 1" to "out 1" and "in 2" to "out 2". -->
<!-- Entry points: "in 1" starts with "Numerical to Polynominal", "in 2" starts directly with "Generate Attributes (2)". -->
<!-- NOTE(review): only the "in 1" chain contains the numerical-to-polynominal conversion; confirm the two chains are meant to differ. -->
<connect from_port="in 1" to_op="Numerical to Polynominal" to_port="example set input"/>
<connect from_port="in 2" to_op="Generate Attributes (2)" to_port="example set input"/>
<!-- Chain for "in 2": label, weight, weight role, binominal label, bootstrap sample, then "out 2". -->
<connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Generate Attributes (5)" to_port="example set input"/>
<connect from_op="Generate Attributes (5)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Numerical to Binominal (2)" to_port="example set input"/>
<connect from_op="Numerical to Binominal (2)" from_port="example set output" to_op="Sample (3)" to_port="example set input"/>
<connect from_op="Sample (3)" from_port="example set output" to_port="out 2"/>
<!-- Chain for "in 1": polynominal conversion, label, weight, weight role, binominal label, bootstrap sample, then "out 1". -->
<connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Generate Attributes (3)" to_port="example set input"/>
<connect from_op="Generate Attributes (3)" from_port="example set output" to_op="Generate Attributes (4)" to_port="example set input"/>
<connect from_op="Generate Attributes (4)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="Sample (2)" to_port="example set input"/>
<connect from_op="Sample (2)" from_port="example set output" to_port="out 1"/>
<!-- Port layout information. -->
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="source_in 3" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
<portSpacing port="sink_out 3" spacing="0"/>
</process>
</operator>
<operator activated="true" class="concurrency:cross_validation" compatibility="7.5.003" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="34">
<!-- Validation settings of "Cross Validation": leave-one-out is enabled and parallel execution is on. -->
<parameter key="split_on_batch_attribute" value="false"/>
<!-- NOTE(review): with leave_one_out enabled, number_of_folds is presumably ignored and one model is trained per example; confirm in the operator help and check the runtime this implies for a neural network. -->
<parameter key="leave_one_out" value="true"/>
<parameter key="number_of_folds" value="10"/>
<parameter key="sampling_type" value="automatic"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<operator activated="true" class="h2o:deep_learning" compatibility="7.5.000" expanded="true"
          height="82" name="Deep Learning" width="90" x="112" y="34">
  <!-- Learner inside the training side of "Cross Validation": an H2O deep learning model with Rectifier activation and five hidden layers of 50 units each, trained for 10 epochs. -->
  <parameter key="activation" value="Rectifier"/>
  <enumeration key="hidden_layer_sizes">
    <parameter key="hidden_layer_sizes" value="50"/>
    <parameter key="hidden_layer_sizes" value="50"/>
    <parameter key="hidden_layer_sizes" value="50"/>
    <parameter key="hidden_layer_sizes" value="50"/>
    <parameter key="hidden_layer_sizes" value="50"/>
  </enumeration>
  <!-- No per-layer dropout ratios are configured. -->
  <enumeration key="hidden_dropout_ratios"/>
  <!-- NOTE(review): reproducible mode and the local seed are both off, so repeated runs may give different models; confirm this is acceptable for the evaluation. -->
  <parameter key="reproducible_(uses_1_thread)" value="false"/>
  <parameter key="use_local_random_seed" value="false"/>
  <parameter key="local_random_seed" value="1992"/>
  <parameter key="epochs" value="10.0"/>
  <parameter key="compute_variable_importances" value="false"/>
  <parameter key="train_samples_per_iteration" value="-2"/>
  <!-- NOTE(review): adaptive_rate is on; the manual learning rate and momentum values below are presumably unused in that mode. Confirm in the H2O documentation. -->
  <parameter key="adaptive_rate" value="true"/>
  <parameter key="epsilon" value="1.0E-8"/>
  <parameter key="rho" value="0.99"/>
  <parameter key="learning_rate" value="0.005"/>
  <parameter key="learning_rate_annealing" value="1.0E-6"/>
  <parameter key="learning_rate_decay" value="1.0"/>
  <parameter key="momentum_start" value="0.0"/>
  <parameter key="momentum_ramp" value="1000000.0"/>
  <parameter key="momentum_stable" value="0.0"/>
  <parameter key="nesterov_accelerated_gradient" value="true"/>
  <!-- Input standardization is on; regularization is L1 only (L2 is 0.0). -->
  <parameter key="standardize" value="true"/>
  <parameter key="L1" value="1.0E-5"/>
  <parameter key="L2" value="0.0"/>
  <parameter key="max_w2" value="10.0"/>
  <parameter key="loss_function" value="Automatic"/>
  <parameter key="distribution_function" value="AUTO"/>
  <!-- Early stopping is disabled, so the stopping_* values below are configured but not active. -->
  <parameter key="early_stopping" value="false"/>
  <parameter key="stopping_rounds" value="1"/>
  <parameter key="stopping_metric" value="AUTO"/>
  <parameter key="stopping_tolerance" value="0.001"/>
  <parameter key="missing_values_handling" value="MeanImputation"/>
  <parameter key="max_runtime_seconds" value="0"/>
  <list key="expert_parameters"/>
  <list key="expert_parameters_"/>
</operator>
<!-- Training side of "Cross Validation": the training set goes into "Deep Learning" and the resulting model is passed out. -->
<connect from_port="training set" to_op="Deep Learning" to_port="training set"/>
<connect from_op="Deep Learning" from_port="model" to_port="model"/>
<!-- Port layout information. -->
<portSpacing port="source_training set" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true"
          height="82" name="Apply Model" width="90" x="112" y="34">
  <!-- Testing side of "Cross Validation": applies the fold's model to the held-out test set. No application parameters are set and no view is created. -->
  <list key="application_parameters"/>
  <parameter key="create_view" value="false"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="7.5.003" expanded="true"
          height="82" name="Performance (2)" width="90" x="246" y="136">
  <!-- Testing side of "Cross Validation": classification performance on the scored test set. Only accuracy and root_mean_squared_error are enabled; the main criterion is the first one. -->
  <!-- NOTE(review): the accompanying description mentions a class imbalance; accuracy alone can look good while the rare class is missed. Consider also enabling a criterion such as kappa or weighted_mean_recall. -->
  <parameter key="main_criterion" value="first"/>
  <parameter key="accuracy" value="true"/>
  <parameter key="classification_error" value="false"/>
  <parameter key="kappa" value="false"/>
  <parameter key="weighted_mean_recall" value="false"/>
  <parameter key="weighted_mean_precision" value="false"/>
  <parameter key="spearman_rho" value="false"/>
  <parameter key="kendall_tau" value="false"/>
  <parameter key="absolute_error" value="false"/>
  <parameter key="relative_error" value="false"/>
  <parameter key="relative_error_lenient" value="false"/>
  <parameter key="relative_error_strict" value="false"/>
  <parameter key="normalized_absolute_error" value="false"/>
  <parameter key="root_mean_squared_error" value="true"/>
  <parameter key="root_relative_squared_error" value="false"/>
  <parameter key="squared_error" value="false"/>
  <parameter key="correlation" value="false"/>
  <parameter key="squared_correlation" value="false"/>
  <parameter key="cross-entropy" value="false"/>
  <parameter key="margin" value="false"/>
  <parameter key="soft_margin_loss" value="false"/>
  <parameter key="logistic_loss" value="false"/>
  <parameter key="skip_undefined_labels" value="true"/>
  <!-- Example weights are taken into account when computing the criteria; no per-class weights are given. -->
  <parameter key="use_example_weights" value="true"/>
  <list key="class_weights"/>
</operator>
<!-- Testing side of "Cross Validation": the model and test set go into "Apply Model", the scored data into "Performance (2)", whose performance vector and example set are passed out. -->
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
<connect from_op="Performance (2)" from_port="example set" to_port="test set results"/>
<!-- Port layout information. -->
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_test set results" spacing="0"/>
<portSpacing port="sink_performance 1" spacing="0"/>
<portSpacing port="sink_performance 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true"
          height="82" name="Apply Model (2)" width="90" x="447" y="289">
  <!-- Applies the model delivered by "Cross Validation" to the "out 2" data of "Set weight" (the Vista_wo_E1 branch). The result is created as a view. -->
  <list key="application_parameters"/>
  <parameter key="create_view" value="true"/>
</operator>
<!-- Top-level wiring: Vista_E1 enters "Set weight" on "in 1" and Vista_wo_E1 on "in 2". -->
<connect from_op="Retrieve Vista_wo_E1" from_port="output" to_op="Set weight" to_port="in 2"/>
<connect from_op="Retrieve Vista_E1" from_port="output" to_op="Set weight" to_port="in 1"/>
<!-- "out 1" (Vista_E1 branch) is validated and used for training; "out 2" (Vista_wo_E1 branch) is scored with the resulting model. -->
<connect from_op="Set weight" from_port="out 1" to_op="Cross Validation" to_port="example set"/>
<connect from_op="Set weight" from_port="out 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Cross Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
<!-- NOTE(review): the scored data goes straight to "result 1"; no performance operator follows "Apply Model (2)", and the performance port of "Cross Validation" is not connected to a result. If test-set performance is wanted, a performance operator would have to be added here. -->
<connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
<!-- Port layout information. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Answers
hello @jakob_roetner - I'm going to pass this on to others in the team who know the DL operators better than I. Maybe @jpuente?
Thank you very much for your effort. Highly appreciated!