"[SOLVED] TF IDF and clustering in the same data set"

zwan · April 2012

Hello,
I am trying to process documents to retrieve :

the wordlist and their TF IDF for each document
and their clustering information (K means)

.

I can say I managed to get some result but I want to store these 2 result sets in a database (MySQL) which I can do easily.
But my concern is how to make a link between my two tables: wordlisttable and metatable, where the cluster information is.
Using Transpose renames all my attributes (att_1, att_2, ...).
How can I know which file is related to a certain att_xx? Is there a way to change this? Any idea is welcome.
To be more clear see the process here:

The XML is here:


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.017) from the original post.
     Text files are turned into an example set, which is duplicated: one copy is
     clustered with k-means and reduced to a subset of attributes (result 2), the
     other copy is transposed and given a "keyword" copy of "id" plus a new id (result 1). -->
<process version="5.1.017">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.017" expanded="true" name="Process">
    <process expanded="true" height="431" width="748">
      <!-- Reads documents from the directory listed under text_directories (class label "text").
           No vector_creation parameter is set here, so the operator default applies;
           the post describes the output as TF-IDF values. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.004" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="text" value="D:\RapidMinerRep\sample docs"/>
        </list>
        <!-- NOTE(review): prune_method is "percentual", but only absolute and rank
             thresholds are set below; presumably they are unused with this method. Confirm. -->
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.05"/>
        <!-- Per-document pipeline, in connection order:
             Tokenize, Transform Cases, Filter Stopwords (English), Stem (Porter). -->
        <process expanded="true" height="370" width="617">
          <operator activated="true" class="text:tokenize" compatibility="5.1.004" expanded="true" height="60" name="Tokenize" width="90" x="51" y="29"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.1.004" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="210"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.004" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="300"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.1.004" expanded="true" height="60" name="Stem (Porter)" width="90" x="179" y="300"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Duplicates the example set: output 1 goes to Clustering, output 2 goes to Transpose. -->
      <operator activated="true" class="multiply" compatibility="5.1.017" expanded="true" height="94" name="Multiply" width="90" x="179" y="30"/>
      <!-- k-means clustering with k = 5. -->
      <operator activated="true" class="k_means" compatibility="5.1.017" expanded="true" height="76" name="Clustering" width="90" x="313" y="30">
        <parameter key="k" value="5"/>
      </operator>
      <!-- Subset filter on the clustered set: only the cluster, id, label and metadata_*
           attributes are named, so the word attributes are not part of this selection. -->
      <operator activated="true" class="select_attributes" compatibility="5.1.017" expanded="true" height="76" name="Select Attributes" width="90" x="447" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="cluster|id|label|metadata_date|metadata_file|metadata_path|"/>
      </operator>
      <!-- Deactivated (activated="false") and not referenced by any connect element below. -->
      <operator activated="false" class="write_database" compatibility="5.1.017" expanded="true" height="60" name="Write Database" width="90" x="648" y="255">
        <parameter key="connection" value="localhost"/>
        <parameter key="table_name" value="tableAprilMeta"/>
      </operator>
      <!-- Transposes the second copy of the example set. The post reports that the
           resulting columns are named att_1, att_2, ... -->
      <operator activated="true" class="transpose" compatibility="5.1.017" expanded="true" height="76" name="Transpose" width="90" x="45" y="300"/>
      <!-- Copies attribute "id" of the transposed set into a new attribute "keyword". -->
      <operator activated="true" class="generate_copy" compatibility="5.1.017" expanded="true" height="76" name="Generate Copy" width="90" x="313" y="300">
        <parameter key="attribute_name" value="id"/>
        <parameter key="new_name" value="keyword"/>
      </operator>
      <!-- Generate ID with default parameters, applied after the copy above. -->
      <operator activated="true" class="generate_id" compatibility="5.1.017" expanded="true" height="76" name="Generate ID" width="90" x="447" y="300"/>
      <!-- Wiring: process input 1 feeds the "word list" port of Process Documents from Files. -->
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Clustering" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
      <!-- result 2: cluster and metadata attributes. -->
      <connect from_op="Select Attributes" from_port="example set output" to_port="result 2"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Generate Copy" to_port="example set input"/>
      <connect from_op="Generate Copy" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <!-- result 1: transposed example set with the "keyword" copy and generated id. -->
      <connect from_op="Generate ID" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Thank you

MariusHelf · April 2012

Why do you want to transpose the data at all? If you simply apply the clustering without Select Attributes afterwards, you have both the clustering information and the word vectors in one example set.

Best, Marius

zwan · April 2012

Hi, the reason I do that is that I want to store the result in a MySQL database. If there is an alternative way to do this directly, that would be awesome.

MariusHelf · April 2012

Did you try the Write Database Operator?

zwan · April 2012

Hello, this operator won't work because the ResultSet has more columns than MySQL supports for a table.
That's the reason I am using the Transpose operator. But then I lose the meta information (clustering, filename,...)
Is there a way to use Transpose and still keep all the information in the ResultSet?

Thank you

MariusHelf · April 2012

Then you can store the transposed example set in the database, and after reloading it transpose it back. You'll end up with the original data set. You only have to re-set the roles and maybe adjust the column types. Please have a look at the attached process.

Best, Marius

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.2.003) demonstrating a transpose round trip:
     two in-memory documents are vectorized and clustered, the clustered set is
     transposed and then transposed again, after which the cluster role is set
     and Guess Types is applied. -->
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <process expanded="true" height="296" width="1016">
      <!-- Two documents created from literal text; no files are read in this process. -->
      <operator activated="true" class="text:create_document" compatibility="5.2.001" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
        <parameter key="text" value="an example text"/>
      </operator>
      <operator activated="true" class="text:create_document" compatibility="5.2.001" expanded="true" height="60" name="Create Document (2)" width="90" x="45" y="120">
        <parameter key="text" value="another example text"/>
      </operator>
      <!-- Builds the example set from both documents; the inner pipeline only tokenizes. -->
      <operator activated="true" class="text:process_documents" compatibility="5.2.001" expanded="true" height="112" name="Process Documents" width="90" x="246" y="30">
        <process expanded="true" height="500" width="950">
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- k-means with no parameters set here, so operator defaults apply. -->
      <operator activated="true" class="k_means" compatibility="5.2.003" expanded="true" height="76" name="Clustering" width="90" x="365" y="30"/>
      <!-- First transpose: the form the reply suggests storing in the database. -->
      <operator activated="true" class="transpose" compatibility="5.2.003" expanded="true" height="76" name="Transpose" width="90" x="447" y="210"/>
      <!-- Second transpose: turns the transposed set back around. -->
      <operator activated="true" class="transpose" compatibility="5.2.003" expanded="true" height="76" name="Transpose (2)" width="90" x="581" y="30"/>
      <!-- Re-assigns the "cluster" role to the attribute named "cluster" after the round trip. -->
      <operator activated="true" class="set_role" compatibility="5.2.003" expanded="true" height="76" name="Set Role" width="90" x="715" y="30">
        <parameter key="name" value="cluster"/>
        <parameter key="target_role" value="cluster"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Guess Types with default parameters; the reply mentions adjusting column types. -->
      <operator activated="true" class="guess_types" compatibility="5.2.003" expanded="true" height="76" name="Guess Types" width="90" x="849" y="30"/>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Create Document (2)" from_port="output" to_op="Process Documents" to_port="documents 2"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
      <!-- result 2: the "original" port of the first Transpose, i.e. its untransposed input. -->
      <connect from_op="Transpose" from_port="original" to_port="result 2"/>
      <connect from_op="Transpose (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Guess Types" to_port="example set input"/>
      <!-- result 1: the set after the double transpose, role fix and type guessing. -->
      <connect from_op="Guess Types" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

zwan · April 2012

Hello, thank you for your help. I really appreciate that. I am right now on a leave and can't try the process you gave right now. But will do asap.
The other IMPORTANT thing is that the data stored is expected to be exploited by a very different system (coded in Java/MySQL) so I am not sure about how to load it and restore the original data set.
Thank you, zwan

zwan · April 2012

Hello, thank you Marius!
I tried the process by adding the Write Database operator, but it still doesn't save the information about the clusters and the filename in the database.
Isn't there a way to get this meta information into the MySQL table (along with all the other TF IDF result set)?

Thank you

MariusHelf · April 2012

Hi,

First of all, please update to the latest version of RapidMiner. Then try this process; it certainly does save the cluster information. This specific process does not store filenames, since the documents are not read from files.

Best, Marius

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.2.003) showing a database round trip:
     the clustered example set is transposed and written to table "test", then the
     table is read back, transposed again, and roles and types are restored. -->
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <process expanded="true" height="519" width="768">
      <!-- Two documents created from literal text; no files are read, so there are no filenames. -->
      <operator activated="true" class="text:create_document" compatibility="5.2.001" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
        <parameter key="text" value="an example text"/>
      </operator>
      <operator activated="true" class="text:create_document" compatibility="5.2.001" expanded="true" height="60" name="Create Document (2)" width="90" x="45" y="120">
        <parameter key="text" value="another example text"/>
      </operator>
      <!-- Builds the example set from both documents; the inner pipeline only tokenizes. -->
      <operator activated="true" class="text:process_documents" compatibility="5.2.001" expanded="true" height="112" name="Process Documents" width="90" x="246" y="30">
        <process expanded="true" height="500" width="950">
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- k-means with no parameters set here, so operator defaults apply. -->
      <operator activated="true" class="k_means" compatibility="5.2.003" expanded="true" height="76" name="Clustering" width="90" x="365" y="30"/>
      <!-- Transposes the clustered set before it is written to the database. -->
      <operator activated="true" class="transpose" compatibility="5.2.003" expanded="true" height="76" name="Transpose" width="90" x="514" y="30"/>
      <!-- Writes the transposed set to table "test" on connection "localhost - test",
           with overwrite_mode set to "overwrite". -->
      <operator activated="true" class="write_database" compatibility="5.2.003" expanded="true" height="60" name="Write Database" width="90" x="648" y="30">
        <parameter key="connection" value="localhost - test"/>
        <parameter key="table_name" value="test"/>
        <parameter key="overwrite_mode" value="overwrite"/>
      </operator>
      <!-- Reads the same table back by table name.
           NOTE(review): no connect element links Write Database to Read Database;
           presumably execution follows the operator order in this file. Confirm. -->
      <operator activated="true" class="read_database" compatibility="5.2.003" expanded="true" height="60" name="Read Database" width="90" x="45" y="300">
        <parameter key="connection" value="localhost - test"/>
        <parameter key="define_query" value="table name"/>
        <parameter key="table_name" value="test"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Assigns the "id" role to the attribute named "id" before transposing back. -->
      <operator activated="true" class="set_role" compatibility="5.2.003" expanded="true" height="76" name="Set Role (2)" width="90" x="179" y="300">
        <parameter key="name" value="id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Second transpose, applied to the data read back from the database. -->
      <operator activated="true" class="transpose" compatibility="5.2.003" expanded="true" height="76" name="Transpose (2)" width="90" x="313" y="300"/>
      <!-- Re-assigns the "cluster" role to the attribute named "cluster". -->
      <operator activated="true" class="set_role" compatibility="5.2.003" expanded="true" height="76" name="Set Role" width="90" x="447" y="300">
        <parameter key="name" value="cluster"/>
        <parameter key="target_role" value="cluster"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Guess Types with default parameters, applied as the last step. -->
      <operator activated="true" class="guess_types" compatibility="5.2.003" expanded="true" height="76" name="Guess Types" width="90" x="581" y="300"/>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Create Document (2)" from_port="output" to_op="Process Documents" to_port="documents 2"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Write Database" to_port="input"/>
      <!-- result 2: the "original" port of the first Transpose, i.e. its untransposed input. -->
      <connect from_op="Transpose" from_port="original" to_port="result 2"/>
      <connect from_op="Read Database" from_port="output" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
      <connect from_op="Transpose (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Guess Types" to_port="example set input"/>
      <!-- result 1: the set restored from the database table. -->
      <connect from_op="Guess Types" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="270"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

"[SOLVED] TF IDF and clustering in the same data set"

Answers