Ensemble method for multiple data sets

lansuminc · August 2014

Hi,

I am currently working in RapidMiner 4.6.

I have extracted features from my main data set into 2 sets of features. One is a word vector (53 features) and the other is a set with 10 different features.

I have 2 different classifiers that I would like to combine in an ensemble method:

Logistic Regression on the word vector
W-J48graft on a different set of features

From my understanding, I can only use operators such as stacking and voting if I give them one and the same data set as input.

How would I go about combining predictions from both my data sets using an ensemble method?

Thank you in advance!

homburg · August 2014

Hi lansuminc,

Ensemble methods require that your features (in your case the word vector and the 10-feature set) are part of every single instance in your data set. Therefore, you could join them into one example set and, inside the Vote or Stacking operator, use one attribute filter per learner to hide the unwanted features from each learner. Please note that the Vote operator performs majority voting for classification tasks, so you might need more than two learners inside...

BUT what is really not possible is combining a regression learner with a classification approach (unless you have both a categorical label and a corresponding numerical value for each example, which is a rather bizarre setup).

Here is an example of a simple stacking approach in which each base learner receives a different input:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: loads the Sonar sample data and runs a split validation
     around a Stacking operator whose three base learners are each trained on a
     different attribute subset of the same example set. -->
<process version="6.0.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Root">
    <process expanded="true">
      <!-- Retrieve the Sonar example set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="6.0.008" expanded="true" height="60" name="Sonar" width="90" x="380" y="30">
        <parameter key="repository_entry" value="//Samples/data/Sonar"/>
      </operator>
      <!-- Split validation: the first subprocess builds the model from the "training"
           port, the second applies it to the "test set" port and measures performance. -->
      <operator activated="true" class="split_validation" compatibility="6.0.008" expanded="true" height="112" name="Validation" width="90" x="514" y="30">
        <process expanded="true">
          <!-- Stacking: the first subprocess delivers the base models, the second
               trains the learner that is delivered as the "stacking model". -->
          <operator activated="true" class="stacking" compatibility="6.0.008" expanded="true" height="60" name="Stacking" width="90" x="112" y="30">
            <process expanded="true">
              <!-- Base learner 1: Decision Tree on attribute_1, attribute_2 and
                   attribute_10 to attribute_29, plus the attribute named "class"
                   (presumably the label; confirm against the Sonar data). -->
              <operator activated="true" class="select_attributes" compatibility="6.0.008" expanded="true" height="76" name="Select Attributes" width="90" x="112" y="30">
                <parameter key="attribute_filter_type" value="subset"/>
                <parameter key="attributes" value="attribute_29|attribute_28|attribute_27|attribute_26|attribute_25|attribute_24|attribute_23|attribute_22|attribute_21|attribute_20|attribute_2|attribute_19|attribute_18|attribute_17|attribute_16|attribute_15|attribute_14|attribute_13|attribute_12|attribute_11|attribute_10|attribute_1|class"/>
              </operator>
              <operator activated="true" class="decision_tree" compatibility="6.0.008" expanded="true" height="76" name="Decision Tree" width="90" x="246" y="30"/>
              <!-- Base learner 2: K-NN (k = 5) on attribute_3, attribute_4 and
                   attribute_30 to attribute_49, plus "class". -->
              <operator activated="true" class="select_attributes" compatibility="6.0.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="112" y="210">
                <parameter key="attribute_filter_type" value="subset"/>
                <parameter key="attributes" value="attribute_49|attribute_48|attribute_47|attribute_46|attribute_45|attribute_44|attribute_43|attribute_42|attribute_41|attribute_40|attribute_4|attribute_39|attribute_38|attribute_37|attribute_36|attribute_35|attribute_34|attribute_33|attribute_32|attribute_31|attribute_30|attribute_3|class"/>
              </operator>
              <operator activated="true" class="k_nn" compatibility="6.0.008" expanded="true" height="76" name="K-NN" width="90" x="246" y="210">
                <parameter key="k" value="5"/>
              </operator>
              <!-- Base learner 3: Linear Regression on attribute_5 to attribute_9 and
                   attribute_50 to attribute_60, plus "class". -->
              <operator activated="true" class="select_attributes" compatibility="6.0.008" expanded="true" height="76" name="Select Attributes (3)" width="90" x="112" y="390">
                <parameter key="attribute_filter_type" value="subset"/>
                <parameter key="attributes" value="class|attribute_9|attribute_8|attribute_7|attribute_60|attribute_6|attribute_59|attribute_58|attribute_57|attribute_56|attribute_55|attribute_54|attribute_53|attribute_52|attribute_51|attribute_50|attribute_5"/>
              </operator>
              <operator activated="true" class="linear_regression" compatibility="6.0.008" expanded="true" height="94" name="Linear Regression" width="90" x="246" y="390"/>
              <!-- Wiring: each "training set N" port feeds its own attribute filter,
                   whose output trains one learner; each learner's model goes to the
                   matching "base model N" port. -->
              <connect from_port="training set 1" to_op="Select Attributes" to_port="example set input"/>
              <connect from_port="training set 2" to_op="Select Attributes (2)" to_port="example set input"/>
              <connect from_port="training set 3" to_op="Select Attributes (3)" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_op="Decision Tree" to_port="training set"/>
              <connect from_op="Decision Tree" from_port="model" to_port="base model 1"/>
              <connect from_op="Select Attributes (2)" from_port="example set output" to_op="K-NN" to_port="training set"/>
              <connect from_op="K-NN" from_port="model" to_port="base model 2"/>
              <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Linear Regression" to_port="training set"/>
              <connect from_op="Linear Regression" from_port="model" to_port="base model 3"/>
              <portSpacing port="source_training set 1" spacing="0"/>
              <portSpacing port="source_training set 2" spacing="0"/>
              <portSpacing port="source_training set 3" spacing="0"/>
              <portSpacing port="source_training set 4" spacing="0"/>
              <portSpacing port="sink_base model 1" spacing="0"/>
              <portSpacing port="sink_base model 2" spacing="0"/>
              <portSpacing port="sink_base model 3" spacing="0"/>
              <portSpacing port="sink_base model 4" spacing="0"/>
            </process>
            <!-- Stacking learner: Naive Bayes is trained on the "stacking examples"
                 port and delivered as the "stacking model". -->
            <process expanded="true">
              <operator activated="true" class="naive_bayes" compatibility="6.0.008" expanded="true" height="76" name="Naive Bayes" width="90" x="123" y="30"/>
              <connect from_port="stacking examples" to_op="Naive Bayes" to_port="training set"/>
              <connect from_op="Naive Bayes" from_port="model" to_port="stacking model"/>
              <portSpacing port="source_stacking examples" spacing="0"/>
              <portSpacing port="sink_stacking model" spacing="0"/>
            </process>
          </operator>
          <connect from_port="training" to_op="Stacking" to_port="training set"/>
          <connect from_op="Stacking" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the stacking model to the test set and pass
             the labelled data to Performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="6.0.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="6.0.008" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: Sonar data goes into Validation; its model and its
           performance result are delivered as process results 1 and 2. -->
      <connect from_op="Sonar" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="18"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Cheers,
Helge

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Ensemble method for multiple data sets

Answers