Wrong results
I have a set of news items (XML format) concerning the following categories (in Dutch): Auto, Economie, Politiek, Sport.
These XML items are read with the Read XML operator, resulting in an example set with Categorie as label attribute and Text and Title as regular attributes.
I apply the Naive Bayes, Cross Validation and Performance operators and get strange performance results.
The imported XML content is classified by humans and should be accurate.
So what is going wrong? It looks as if I am making a systematic error in my approach.
If I replace Bayes by k-NN, it gives the same performance results.
Who has some clues to resolve this?
I have attached the RM process and the XML data in a zip file.
Best Answer
-
Thomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn
@AKO Yes, you are getting terrible results. You are not even Text Processing the data or cleaning it up to extract content. My suggestion is to install the Text Processing and Web Mining extensions, then troll through the Community for some Text Processing posts and processes.
By doing some basic Text Processing I increased your accuracy and recall, so that's where you need to focus.
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.0.001">
  <!--
    RapidMiner process for classifying news items by category.
    Top-level chain (see the connect elements near the end of the root operator):
    Read XML, Rename, Set Role, Unescape HTML, Nominal to Text,
    Process Documents from Data, Cross Validation.
    Cross Validation trains a Deep Learning model and scores it with
    Apply Model followed by Performance.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- NOTE: the file parameter is an absolute path on the author's machine; point it at your own copy of the XML data. -->
      <operator activated="true" class="advanced_file_connectors:read_xml" compatibility="8.0.001" expanded="true" height="68" name="Read XML" width="90" x="45" y="34">
        <parameter key="file" value="C:\Users\Thomas Ott\Desktop\Training_items_RapMin.xml"/>
        <parameter key="xpath_for_examples" value="//root/site/page/children/page/children/page"/>
        <enumeration key="xpaths_for_attributes">
          <parameter key="xpath_for_attribute" value="title[1]/text()"/>
          <parameter key="xpath_for_attribute" value="body[1]/text()"/>
          <parameter key="xpath_for_attribute" value="categorie[1]/text()"/>
        </enumeration>
        <list key="namespaces"/>
        <parameter key="use_default_namespace" value="false"/>
        <parameter key="parse_numbers" value="false"/>
        <list key="annotations"/>
        <parameter key="locale" value="Dutch"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="title[1]/text().true.polynominal.attribute"/>
          <parameter key="1" value="body[1]/text().true.polynominal.attribute"/>
          <parameter key="2" value="categorie[1]/text().true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Replace the XPath-derived attribute names with Text, Categorie and Titel. -->
      <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
        <parameter key="old_name" value="body[1]/text()"/>
        <parameter key="new_name" value="Text"/>
        <list key="rename_additional_attributes">
          <parameter key="categorie[1]/text()" value="Categorie"/>
          <parameter key="title[1]/text()" value="Titel"/>
        </list>
      </operator>
      <!-- Categorie becomes the label; Text and Titel stay regular. NOTE: a breakpoint is set after this operator (breakpoints="after"). -->
      <operator activated="true" breakpoints="after" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="313" y="136">
        <parameter key="attribute_name" value="Categorie"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles">
          <parameter key="Text" value="regular"/>
          <parameter key="Titel" value="regular"/>
        </list>
      </operator>
      <operator activated="true" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML" width="90" x="447" y="136">
        <parameter key="attribute" value="Text"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="8.0.001" expanded="true" height="82" name="Nominal to Text" width="90" x="581" y="34">
        <parameter key="attribute" value="Titel"/>
      </operator>
      <!-- Text processing: each document passes through Tokenize, Transform Cases and Generate n-Grams (Terms). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="715" y="34">
        <parameter key="prune_method" value="percentual"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="concurrency:cross_validation" compatibility="8.0.001" expanded="true" height="145" name="Cross Validation" width="90" x="849" y="34">
        <!-- First subprocess: the training set feeds Deep Learning (two hidden layers of 50 units), which delivers the model. -->
        <process expanded="true">
          <operator activated="true" class="h2o:deep_learning" compatibility="7.6.001" expanded="true" height="82" name="Deep Learning" width="90" x="223" y="34">
            <enumeration key="hidden_layer_sizes">
              <parameter key="hidden_layer_sizes" value="50"/>
              <parameter key="hidden_layer_sizes" value="50"/>
            </enumeration>
            <enumeration key="hidden_dropout_ratios"/>
            <list key="expert_parameters"/>
            <list key="expert_parameters_"/>
          </operator>
          <connect from_port="training set" to_op="Deep Learning" to_port="training set"/>
          <connect from_op="Deep Learning" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Second subprocess: the model is applied to the test set and the labelled data is scored by Performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true" height="82" name="Performance" width="90" x="246" y="34">
            <parameter key="classification_error" value="true"/>
            <parameter key="absolute_error" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Read XML" from_port="output" to_op="Rename" to_port="example set input"/>
      <connect from_op="Rename" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Unescape HTML" to_port="example set input"/>
      <connect from_op="Unescape HTML" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Cross Validation" to_port="example set"/>
      <connect from_op="Cross Validation" from_port="example set" to_port="result 1"/>
      <connect from_op="Cross Validation" from_port="performance 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
2
Answers
@thomas: thank you very much. I see now that I forgot to clean the data thoroughly. Thanks!