"RM 508 Simple Operators (Data Transformation using PCA)"

jingram · July 2010

Dear all,

My name is Jon and I have come across RM in my research for a Software Engineering thesis at the University of Sydney, Australia. I can see that RM is very powerful. Briefly, my thesis application should monitor web services, collect metrics, perform data transformation, perform outlier detection and notify administrators if any "faults" are detected. In my thesis application, I would like to perform the following operations on my data:

0. Feature collection (collecting various data)
1. RM operator: Data transformation (using PCA, or ICA, or Kernel PCA) - my application will select one of these feature extraction techniques based on how well it performs.
2. RM operator: Outlier detection (using any of the 4 operators, or any new operators that I write) - again, select an operation based on how well it detects outliers.
3. Identify outliers and notify administrators - i.e. get the results of outlier detection.

As you can see, I would like to run the operators like this: 0 --> 1 --> 2 --> 3

Unfortunately, I don't think the white paper would contain basic tutorial information I need at this early stage (maybe later for operator creation) and the wiki page: http://rapid-i.com/wiki/index.php?title=Integrating_RapidMiner_into_your_application "using single operators" section seems to be outdated (since it uses operator.apply which is deprecated).

OK, so I can do this sequence of operations in the GUI fine, and I can see that I want this process:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.0 process: Retrieve -> PCA -> Apply Model, with both Apply Model outputs exposed as process results. -->
<process version="5.0">
	<context>
		<input/>
		<output/>
		<macros/>
	</context>
	<operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Process">
		<process expanded="true" height="550" width="949">
			<!-- Loads the example set stored at the repository entry ../data/PCATutorial. -->
			<operator activated="true" class="retrieve" compatibility="5.0.8" expanded="true" height="60" name="Retrieve" width="90" x="236" y="283">
				<parameter key="repository_entry" value="../data/PCATutorial"/>
			</operator>
			<!-- PCA is used with its default parameters here; no parameter is set on the operator itself. -->
			<operator activated="true" class="principal_component_analysis" compatibility="5.0.8" expanded="true" height="94" name="PCA" width="90" x="447" y="210"/>
			<!-- The variance threshold of 0.95 is passed as an application parameter of Apply Model. -->
			<operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model" width="90" x="648" y="210">
				<list key="application_parameters">
					<parameter key="variance_threshold" value="0.95"/>
				</list>
			</operator>
			<!-- Wiring: every operator input must be connected explicitly, otherwise no data is delivered to it. -->
			<connect from_op="Retrieve" from_port="output" to_op="PCA" to_port="example set input"/>
			<connect from_op="PCA" from_port="original" to_op="Apply Model" to_port="unlabelled data"/>
			<connect from_op="PCA" from_port="preprocessing model" to_op="Apply Model" to_port="model"/>
			<!-- Connections without to_op end at the result ports of the enclosing process. -->
			<connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
			<connect from_op="Apply Model" from_port="model" to_port="result 2"/>
			<portSpacing port="source_input 1" spacing="0"/>
			<portSpacing port="sink_result 1" spacing="126"/>
			<portSpacing port="sink_result 2" spacing="0"/>
			<portSpacing port="sink_result 3" spacing="0"/>
		</process>
	</operator>
</process>

BTW, I don't especially want to directly use this XML file in process creation, 1. because I want to retrieve the data elsewhere, and 2. because I want to dynamically change what operations are done.

In code I have a file that looks like this:


package thesis.PCA;

import com.rapidminer.tools.OperatorService;
import com.rapidminer.Process;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.ModelApplier;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.generator.ExampleSetGenerator;
import com.rapidminer.tools.Ontology;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

public class RMTest {

	public static void main(String[] args) {
		RapidMiner.init();

		// create process
		Process process = createProcess();

		// print process setup
		System.out.println(process.getRootOperator().createProcessTree(0));

		// create some input from application
		// later, this will be real data
		IOContainer input = createInput();

		try {
			// perform process
			process.run(input);
		} catch (OperatorException e) {
			e.printStackTrace();
		}
	}

	public static Process createProcess() {
		// create process
		Process process = new Process();
		try {
			// create operator to create some example data
			Operator inputOperator =
					OperatorService.createOperator(ExampleSetGenerator.class);

			// set parameters
			inputOperator.setParameter("target_function", "sum classification");

			// PCA
			Operator pca =
					OperatorService.createOperator(com.rapidminer.operator.features.transformation.PCA.class);

			// applying the model
			Operator modelApp =
					OperatorService.createOperator(ModelApplier.class);

                        // I believe these 3 lines of code do not connect my operators properly
			process.getRootOperator().getSubprocess(0).addOperator(inputOperator);
			process.getRootOperator().getSubprocess(0).addOperator(pca);
			process.getRootOperator().getSubprocess(0).addOperator(modelApp);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return process;
	}

	// code snippet taken from somewhere else
	private static IOContainer createInput() {
		List<Attribute> attributes = new LinkedList<Attribute>();

		for (int a = 0; a < 10; a++) {
			attributes.add(AttributeFactory.createAttribute("a" + a, Ontology.REAL));
		}

		Attribute label = AttributeFactory.createAttribute("class", Ontology.NOMINAL);
		attributes.add(label);
		Random rand = new Random();
		MemoryExampleTable table = new MemoryExampleTable(attributes);

		// Create 8 data intances and fill the data
		for (int d = 0; d < 8; d++) {
			double[] data = new double[attributes.size()];
			for (int dim = 0; dim < 10; dim++) {
				data[dim] = rand.nextDouble();
			}
			if (rand.nextBoolean()) {
				data[attributes.size() - 1] = 1d;
			} else {
				data[attributes.size() - 1] = 0d;
			}
			table.addDataRow(new DoubleArrayDataRow(data));
		}

		ExampleSet exampleSet = table.createExampleSet(label);

		IOContainer container = new IOContainer(new IOObject[]{exampleSet});

		return container;
	}
}

And, the stack trace of the error suggests that I have not connected my input data to the PCA port:

com.rapidminer.operator.UserError: No data was deliverd at port PCA.example set input. 
        at com.rapidminer.operator.ports.impl.AbstractPort.getData(AbstractPort.java:79)
        at com.rapidminer.operator.features.transformation.PCA.doWork(PCA.java:132)
        at com.rapidminer.operator.Operator.execute(Operator.java:768)
        at com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:51)
        at com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:709)
        at com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:368)
        at com.rapidminer.operator.Operator.execute(Operator.java:768)
        at com.rapidminer.Process.run(Process.java:863)
        at com.rapidminer.Process.run(Process.java:770)
        at com.rapidminer.Process.run(Process.java:765)
        at thesis.PCA.RMTest.main(RMTest.java:39)

1. It is clear from the XML file above that I need to connect ports, but I cannot find examples of how to do this.
2. Nor can I find examples of how to retrieve any data once the operations have been performed.

Quite a lot of the examples in this forum seem to use code prior to RM5.

I'd appreciate any suggestions anyone has.

Many thanks,

Jon

jingram · July 2010

Oh, and if anyone would like to correct a typo, the message:

No data was deliverd at port PCA.example set input.

should read:

No data was delivered at port PCA.example set input.

"delivered" was spelled incorrectly

haddock · July 2010

G'Day!

And welcome to the house of shame

In your position I'd go for the white paper, unless loot is extra scarce. There was a big change between 4 and 5, namely, among a lot of other stuff, those ports. An alternative might be to look at the source of the extensions ( series, text, pmml etc..). Code for version 4 for laying down and avoiding I'd say.

Wish I had better newz!

jingram · July 2010

Hi haddock,

Thanks for your reply.

From reading the topic on the white paper, it suggests that it is mainly for extending RM - at this stage I don't need to extend it but just want to convert my XML file into the equivalent code. If the white paper does indeed contain such code for basic operations I would consider purchasing it, but not if it is mainly directed towards writing extensions.

I understand that this is only partly an open-source community, but I thought there might be some basic unit tests written in RM5 containing simple operations (similar to the wiki) which would eventually be used to update the wiki anyway.

If the source of the extensions (e.g. text) contains such code to connect operators, what would be a good package to look in?

Thanks again

P.S. this is the 2nd piece of software that I have used written in Germany and I must say the quality of both has amazed me!

haddock · July 2010

G'day Jon!

I wasn't really promoting the white paper as a guide for writing an application ( am I right in thinking Seb may produce one ? ), more as a quick way of understanding the classes, how they connect, and their required inputs. This latter point is a crucial difference between 4 and 5 - for example in 4 learner classes could be interrogated for their capabilities, now they use port conditions ( I had a Prolog parser which examined the classes in 4 for parallelism, and found this out the hard way! ).

If time is pressing and loot is not, I still think that you could figure how to make an application out of your XML by understanding the implications of its XML operators. However, if time is pressing and so is loot, then dive into the source of the extensions and see how you get on. Finally, if neither press, urge Seb to burst into print!

Happy coding!

jingram · July 2010

Howdy, thanks again for your reply! I was lucky to find a small snippet in the forums http://rapid-i.com/rapidforum/index.php/topic,1641.msg6389.html#msg6389 that helped me to work out operator connectivity.

I am in the process of doing this in my code and very soon I'll post the code here so that it can be seen by others, but perhaps if it's good enough and a simple-enough example, it can add to the article on the wiki page.

Jon

haddock · July 2010

Nice one Jon, well done ;D

land · July 2010

Hi all,
yes, we are going to publish something for integrating RapidMiner. Might be I will start next week with writing it. But I would like to finish my other projects first...
One of these projects is to improve the operator documentation, and since we decided to use the wiki (and hopefully the help of some users) for this, it's useless to add any new content. The totally outdated content will be removed completely.

By the way: This software is totally open source and hence it is an open source community. The fact that we are working for a company and trying to earn our living from support and training does not mean that we are doing something immoral, like closed-source products.

Thank you for the kind words about the code quality. We are always trying to improve it...

Greetings,
Sebastian

jingram · July 2010

Hi,

As I mentioned, I have the operation connections working (well, it appears to be working but there's possibly a better or "best practices" way to do it). I have posted the entire Java source here in case it helps anyone else. One thing that I am struggling with is connecting my data to the first operator (or, perhaps to the root operator) - can anyone suggest how to do this? Currently, a dummy operator is generating useless data.

After tracing through source, I can show exactly where I think the problem is:

PerformPCA.java:

process.run(input);

Process.java:

// Excerpt: when an input container was passed to run(...), its IOObjects are
// handed to the root operator as a list.
if (input != null) {
	rootOperator.deliverInput(Arrays.asList(input.getIOObjects()));
}

ProcessRootOperator.java:

/**
 * Forwards the given process inputs to {@code processInputExtender}.
 *
 * @param inputs The objects to deliver.
 */
public void deliverInput(List<IOObject> inputs) {
	processInputExtender.deliver(inputs);		
}

OutputPortExtender.java:

/**
 * Delivers the given objects, in order, to the managed output ports that are connected.
 *
 * Ports that are not connected are skipped entirely and do not consume an input,
 * so an input is silently dropped when no managed port is connected.
 *
 * @param inputs The objects to deliver.
 */
public void deliver(List<IOObject> inputs) {
	// Index of the next input; only advanced for connected ports.
	int i = 0;
	for (OutputPort port : getManagedPorts()) {
	        if (port.isConnected()) {                                                          // I believe this line is the problem - but how do I connect this port so the input is delivered?
			if (i >= inputs.size()) {
				// More connected ports than inputs: warn, deliver nothing to this port.
				getPorts().getOwner().getOperator().getLogger().warning("Insufficient input for "+port.getSpec());
			} else {
				IOObject input = inputs.get(i);
				port.deliver(input);
				// Build a readable name of the delivered object for the log message below.
				String name;
				if (input instanceof ResultObject) {
					name = ((ResultObject)input).getName();
				} else {
					name = input.getClass().getName();
				}
				name += " ("+input.getSource()+")";
				getPorts().getOwner().getOperator().getLogger().fine("Delivering " + name + " to " + port.getSpec());
			}
			i++;
		}
	}		
}

And here is my PerformPCA.java:

package thesis.rapidminer;

import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.Process;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.ModelApplier;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.features.transformation.PCA;
import com.rapidminer.operator.generator.ExampleSetGenerator;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.preprocessing.normalization.Normalization;
import com.rapidminer.parameter.Parameters;
import com.rapidminer.tools.Ontology;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Builds and runs a RapidMiner 5 process entirely from code:
 * Generate Data -> Normalize -> PCA -> Apply Model.
 */
public class PerformPCA {

	private static final Logger LOGGER = Logger.getLogger(PerformPCA.class.getName());

	/**
	 * Initialises RapidMiner, builds the process, logs its operator tree and runs it.
	 *
	 * @param args The command line arguments (unused).
	 */
	public static void main(String[] args) {
		RapidMiner.init();

		// Create the process
		Process process = createProcess();

		// Print process setup
		LOGGER.log(Level.INFO, process.getRootOperator().createProcessTree(0));

		try {
			// The process generates its own data. The example set from createInput()
			// is not passed in yet, because nothing in this process is connected to
			// the root operator's input port, so it would not be delivered.
			process.run();
		} catch (OperatorException ex) {
			LOGGER.log(Level.SEVERE, null, ex);
		}
	}

	/**
	 * Creates input from the PCA tutorial example set.
	 *
	 * @return A container holding the example set as its only object.
	 */
	private static IOContainer createInput() {
		return new IOContainer(new IOObject[]{createPCATutorialExampleSet()});
	}

	/**
	 * Generates an example set of data taken from Lindsay Smith's paper
	 * "A tutorial on Principal Components Analysis".
	 *
	 * @return An example set containing the data to reduce.
	 */
	private static ExampleSet createPCATutorialExampleSet() {
		List<Attribute> attributes = new LinkedList<Attribute>();

		attributes.add(AttributeFactory.createAttribute("First", Ontology.REAL));
		attributes.add(AttributeFactory.createAttribute("Second", Ontology.REAL));

		MemoryExampleTable table = new MemoryExampleTable(attributes);

		double[][] data = {
			{2.5, 2.4},
			{0.5, 0.7},
			{2.2, 2.9},
			{1.9, 2.2},
			{3.1, 3.0},
			{2.3, 2.7},
			{2, 1.6},
			{1, 1.1},
			{1.5, 1.6},
			{1.1, 0.9}
		};

		// One data row per inner array; a row takes a double[], not the whole matrix.
		for (double[] row : data) {
			table.addDataRow(new DoubleArrayDataRow(row));
		}

		return table.createExampleSet();
	}

	/**
	 * Creates a process that performs PCA.
	 *
	 * The process performs the following sequence of operations:
	 *
	 * Generate Data (Generate Data)
	 * Normalize (Normalize)
	 * PCA (Principal Component Analysis)
	 * Apply Model (Apply Model)
	 *
	 * @return The created process.
	 * @throws IllegalStateException If an operator cannot be created or a port name is unknown.
	 */
	public static Process createProcess() {
		// Create a process
		Process process = new Process();

		Operator inputOperator = createInputOperator();
		Operator normalisationOperator = createNormalisationOperator();
		Operator pcaOperator = createPCAOperator();
		Operator modelApplierOperator = createModelApplierOperator();

		// Add the operators to the process before wiring them, in execution order.
		Operator[] chain = {inputOperator, normalisationOperator, pcaOperator, modelApplierOperator};
		for (Operator operator : chain) {
			process.getRootOperator().getSubprocess(0).addOperator(operator);
		}

		connect(inputOperator, "output", normalisationOperator, "example set input");
		connect(normalisationOperator, "example set output", pcaOperator, "example set input");
		connect(pcaOperator, "original", modelApplierOperator, "unlabelled data");
		connect(pcaOperator, "preprocessing model", modelApplierOperator, "model");

		return process;
	}

	/**
	 * Connects a named output port of one operator to a named input port of another.
	 *
	 * @throws IllegalStateException If either port lookup yields no port.
	 */
	private static void connect(Operator from, String outputName, Operator to, String inputName) {
		OutputPort source = from.getOutputPorts().getPortByName(outputName);
		InputPort target = to.getInputPorts().getPortByName(inputName);

		// Guard against a misspelt port name instead of failing with a bare NullPointerException.
		if (source == null || target == null) {
			throw new IllegalStateException("Cannot connect \"" + outputName + "\" to \"" + inputName + "\": port not found");
		}

		source.connectTo(target);
	}

	/**
	 * Creates an operator of the given class.
	 *
	 * @throws IllegalStateException If the operator could not be created.
	 */
	private static <T extends Operator> T newOperator(Class<T> operatorClass) {
		try {
			return OperatorService.createOperator(operatorClass);
		} catch (OperatorCreationException ex) {
			throw new IllegalStateException("Could not create operator " + operatorClass.getName(), ex);
		}
	}

	/**
	 * Creates the example set generator.
	 *
	 * The example set is generated using the "sum classification" function.
	 *
	 * @return The configured operator.
	 */
	private static Operator createInputOperator() {
		Operator inputOperator = newOperator(ExampleSetGenerator.class);

		inputOperator.setParameter(ExampleSetGenerator.PARAMETER_TARGET_FUNCTION, "sum classification");

		return inputOperator;
	}

	/**
	 * Creates the normalisation operator.
	 *
	 * The normalisation operator normalises the data between 0 and 1.
	 *
	 * @return The configured operator.
	 */
	private static Operator createNormalisationOperator() {
		Operator normalisationOperator = newOperator(Normalization.class);

		// Normalize the data between 0 and 1
		normalisationOperator.setParameter(Normalization.PARAMETER_NORMALIZATION_METHOD, String.valueOf(Normalization.METHOD_RANGE_TRANSFORMATION));
		normalisationOperator.setParameter(Normalization.PARAMETER_MIN, "0");
		normalisationOperator.setParameter(Normalization.PARAMETER_MAX, "1");

		return normalisationOperator;
	}

	/**
	 * Creates the PCA operator.
	 *
	 * The PCA operator uses a variance threshold of 0.95.
	 *
	 * @return The configured operator.
	 */
	private static Operator createPCAOperator() {
		Operator pcaOperator = newOperator(PCA.class);

		// Maintain enough PCs to account for 95% of the variance in the data
		pcaOperator.setParameter(PCA.PARAMETER_VARIANCE_THRESHOLD, "0.95");

		return pcaOperator;
	}

	/**
	 * Creates the model applier operator.
	 *
	 * The variance threshold of 0.95 is passed on as an application parameter.
	 *
	 * @return The configured operator.
	 */
	private static Operator createModelApplierOperator() {
		Operator modelApplierOperator = newOperator(ModelApplier.class);

		modelApplierOperator.setParameter(ModelApplier.PARAMETER_APPLICATION_PARAMETERS, PCA.PARAMETER_VARIANCE_THRESHOLD + Parameters.PAIR_SEPARATOR + "0.95");

		return modelApplierOperator;
	}
}

Can anyone help? Thanks

EDIT: this isn't really tested as yet and I also believe I might need to connect my last operator (model applier) to a result (e.g. result 1, result 2) as in the XML file....hmmm?

jingram · July 2010

Firstly, sorry if it appears that I am double posting, but I believe I have pretty much finished creating my PCA transformation and I'd like to share it for others if they need, but also if some of the source writers want to suggest that I am doing something the wrong way. In case anyone wants to know the method I used to solve my problems, I basically created the process I wanted in RM, then I created a program that loads a process from an XML file and I debugged that program to see each case that the XMLBuilder used to build connections, etc.

The code prints out the results into XML format to System.out and when compared to the results in RM GUI, they match up.

Here is the code that successfully (no guarantee) gives the same results as the PCA tutorial I used:

package thesis.rapidminer;

import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.Process;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.ExecutionUnit;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.ModelApplier;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.features.transformation.PCA;
import com.rapidminer.operator.generator.ExampleSetGenerator;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.preprocessing.normalization.Normalization;
import com.rapidminer.parameter.Parameters;
import com.rapidminer.tools.Ontology;
import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Builds and runs a RapidMiner 5 process entirely from code. The data passed
 * to {@code process.run(input)} flows through an optional Normalize step, PCA
 * and Apply Model, and the two outputs of Apply Model are exposed as the
 * process results "result 1" and "result 2".
 */
public class PerformPCA {

	/**
	 * Determines whether the data should be normalised before applying PCA.
	 */
	public static boolean normalise = false;
	/**
	 * The location and name to save the process XML file.
	 */
	public static String processFilePathname = "PCATutorial-code/processes/process.xml";

	private static final Logger LOGGER = Logger.getLogger(PerformPCA.class.getName());

	/**
	 * Performs PCA on the hard-coded data based on Lindsay Smith's paper.
	 *
	 * Runs the process, saves it as XML and prints both results to System.out.
	 * If the run fails, nothing is saved or printed.
	 *
	 * @param args The command line arguments.
	 */
	public static void main(String[] args) {
		RapidMiner.init();

		// Create the process
		Process process = createProcess();

		// Print process setup
		LOGGER.log(Level.INFO, process.getRootOperator().createProcessTree(0));

		// Create some input
		IOContainer input = createInput();

		try {
			// Run the process
			process.run(input);
		} catch (OperatorException ex) {
			LOGGER.log(Level.SEVERE, null, ex);
			return;
		}

		// Save the process to XML file; a failure here does not prevent printing the results
		try {
			process.save(new File(processFilePathname));
		} catch (IOException ex) {
			LOGGER.log(Level.SEVERE, null, ex);
		}

		printResult(process, "result 1");
		printResult(process, "result 2");
	}

	/**
	 * Prints the data that arrived at the given result port of the process to System.out.
	 *
	 * @param process The process that has been run.
	 * @param sinkName The name of the result port, e.g. "result 1".
	 */
	private static void printResult(Process process, String sinkName) {
		InputPort sink = process.getRootOperator().getSubprocess(0).getInnerSinks().getPortByName(sinkName);

		try {
			IOObject data = sink.getData();

			// write(...) was observed to close the stream it is given; handing it
			// System.out directly meant that only the first result was ever printed.
			data.write(nonClosing(System.out));
		} catch (UserError ex) {
			LOGGER.log(Level.SEVERE, null, ex);
		} catch (IOException ex) {
			LOGGER.log(Level.SEVERE, null, ex);
		}
	}

	/**
	 * Wraps a stream so that closing the wrapper only flushes the underlying stream.
	 *
	 * @param target The stream that must stay open.
	 * @return A stream that writes to {@code target} but never closes it.
	 */
	private static java.io.OutputStream nonClosing(java.io.OutputStream target) {
		return new java.io.FilterOutputStream(target) {

			@Override
			public void write(byte[] buffer, int offset, int length) throws IOException {
				// Delegate in bulk; the inherited implementation writes byte by byte.
				out.write(buffer, offset, length);
			}

			@Override
			public void close() throws IOException {
				flush();
			}
		};
	}

	/**
	 * Creates input from the PCA tutorial example set.
	 *
	 * @return A container holding the example set as its only object.
	 */
	private static IOContainer createInput() {
		return new IOContainer(new IOObject[]{createPCATutorialExampleSet()});
	}

	/**
	 * Generates an example set of data taken from Lindsay Smith's paper
	 * "A tutorial on Principal Components Analysis".
	 *
	 * @return An example set containing the data to reduce.
	 */
	private static ExampleSet createPCATutorialExampleSet() {
		List<Attribute> attributes = new LinkedList<Attribute>();

		attributes.add(AttributeFactory.createAttribute("First", Ontology.REAL));
		attributes.add(AttributeFactory.createAttribute("Second", Ontology.REAL));

		MemoryExampleTable table = new MemoryExampleTable(attributes);

		double[][] data = {
			{2.5, 2.4},
			{0.5, 0.7},
			{2.2, 2.9},
			{1.9, 2.2},
			{3.1, 3.0},
			{2.3, 2.7},
			{2, 1.6},
			{1, 1.1},
			{1.5, 1.6},
			{1.1, 0.9}
		};

		// One data row per inner array; a row takes a double[], not the whole matrix.
		for (double[] row : data) {
			table.addDataRow(new DoubleArrayDataRow(row));
		}

		return table.createExampleSet();
	}

	/**
	 * Creates a process that performs PCA.
	 *
	 * The process performs the following sequence of operations:
	 *
	 * "input 1" (the data passed to process.run)
	 * ? Normalize (Normalize) - only if the normalise variable is true
	 * PCA (Principal Component Analysis)
	 * Apply Model (Apply Model)
	 * "result 1" (labelled data) and "result 2" (model)
	 *
	 * @return The created process.
	 * @throws IllegalStateException If an operator could not be created.
	 */
	public static Process createProcess() {
		// Create a process
		Process process = new Process();

		ExecutionUnit executionUnit = process.getRootOperator().getSubprocess(0);

		// The port that currently carries the example set; starts at the process input.
		OutputPort dataPort = executionUnit.getInnerSources().getPortByName("input 1");

		if (normalise) {
			Operator normalisationOperator = createNormalisationOperator();
			executionUnit.addOperator(normalisationOperator);

			// Connect "input 1" to "example set input"
			dataPort.connectTo(normalisationOperator.getInputPorts().getPortByName("example set input"));

			// PCA now reads the normalised data instead of the raw input
			dataPort = normalisationOperator.getOutputPorts().getPortByName("example set output");
		}

		Operator pcaOperator = createPCAOperator();
		executionUnit.addOperator(pcaOperator);

		// Connect the (possibly normalised) data to "example set input"
		dataPort.connectTo(pcaOperator.getInputPorts().getPortByName("example set input"));

		Operator modelApplierOperator = createModelApplierOperator();
		executionUnit.addOperator(modelApplierOperator);

		// Connect "original" to "unlabelled data"
		pcaOperator.getOutputPorts().getPortByName("original")
				.connectTo(modelApplierOperator.getInputPorts().getPortByName("unlabelled data"));
		// Connect "preprocessing model" to "model"
		pcaOperator.getOutputPorts().getPortByName("preprocessing model")
				.connectTo(modelApplierOperator.getInputPorts().getPortByName("model"));

		// Connect "labelled data" to "result 1"
		modelApplierOperator.getOutputPorts().getPortByName("labelled data")
				.connectTo(executionUnit.getInnerSinks().getPortByName("result 1"));
		// Connect "model" to "result 2"
		modelApplierOperator.getOutputPorts().getPortByName("model")
				.connectTo(executionUnit.getInnerSinks().getPortByName("result 2"));

		return process;
	}

	/**
	 * Creates an operator of the given class.
	 *
	 * @throws IllegalStateException If the operator could not be created.
	 */
	private static <T extends Operator> T newOperator(Class<T> operatorClass) {
		try {
			return OperatorService.createOperator(operatorClass);
		} catch (OperatorCreationException ex) {
			throw new IllegalStateException("Could not create operator " + operatorClass.getName(), ex);
		}
	}

	/**
	 * Creates the normalisation operator.
	 *
	 * The normalisation operator normalises the data between 0 and 1.
	 *
	 * @return The configured operator.
	 */
	private static Operator createNormalisationOperator() {
		Operator normalisationOperator = newOperator(Normalization.class);

		// Normalize the data between 0 and 1
		normalisationOperator.setParameter(Normalization.PARAMETER_NORMALIZATION_METHOD, String.valueOf(Normalization.METHOD_RANGE_TRANSFORMATION));
		normalisationOperator.setParameter(Normalization.PARAMETER_MIN, "0");
		normalisationOperator.setParameter(Normalization.PARAMETER_MAX, "1");

		return normalisationOperator;
	}

	/**
	 * Creates the PCA operator.
	 *
	 * The PCA operator uses a variance threshold of 0.95.
	 *
	 * @return The configured operator.
	 */
	private static Operator createPCAOperator() {
		Operator pcaOperator = newOperator(PCA.class);

		// Maintain enough PCs to account for 95% of the variance in the data
		pcaOperator.setParameter(PCA.PARAMETER_VARIANCE_THRESHOLD, "0.95");

		return pcaOperator;
	}

	/**
	 * Creates the model applier operator.
	 *
	 * The variance threshold of 0.95 is passed on as an application parameter.
	 *
	 * @return The configured operator.
	 */
	private static Operator createModelApplierOperator() {
		Operator modelApplierOperator = newOperator(ModelApplier.class);

		modelApplierOperator.setParameter(ModelApplier.PARAMETER_APPLICATION_PARAMETERS, PCA.PARAMETER_VARIANCE_THRESHOLD + Parameters.PAIR_SEPARATOR + "0.95");

		return modelApplierOperator;
	}
}

Now onto adding in more operators (outlier detection) and adding some logic for my application to use PCA or ICA or SPCA (a special form of PCA called Simple PCA) etc. I hope I don't have too much trouble writing a custom Operator for SPCA :P

EDIT: it won't print "result 2" because after doing "write" the first time, it actually closes System.out (because "write" closes the OutputStream) :S - I am not sure if this is the intended behaviour.

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

"RM 508 Simple Operators (Data Transformation using PCA)"

Answers