The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
"[SOLVED] TF IDF and clustering in the same data set"
Hello,
I am trying to process documents to retrieve :
I can say I managed to get some result but I want to store these 2 result sets in a database (MySQL) which I can do easily.
But my concern is how to make a link between my two tables: worldlisttable and metatable where the cluster information is.
Using the Transpose operator renames all my attributes (att_1, att_2, ...).
How can I know which file is related to a certain att_xx? Is there a way to change this? Any idea is welcome.
To be more clear see the process here:
The XML is here:
I am trying to process documents to retrieve :
- the wordlist and their TF IDF for each document
- and their clustering information (K means)
I can say I managed to get some result but I want to store these 2 result sets in a database (MySQL) which I can do easily.
But my concern is how to make a link between my two tables: worldlisttable and metatable where the cluster information is.
Using the Transpose operator renames all my attributes (att_1, att_2, ...).
How can I know which file is related to a certain att_xx? Is there a way to change this? Any idea is welcome.
To be more clear see the process here:
The XML is here:
Thank you
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition.
  Data flow, as wired by the <connect> elements below:
    Process Documents from Files -> Multiply
      Multiply output 1 -> Clustering -> Select Attributes -> result 2
      Multiply output 2 -> Transpose -> Generate Copy -> Generate ID -> result 1
-->
<process version="5.1.017">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.017" expanded="true" name="Process">
    <process expanded="true" height="431" width="748">

      <!-- Reads the documents found under the "text" directory entry and emits an example set. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.004"
                expanded="true" height="76" name="Process Documents from Files"
                width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="text" value="D:\RapidMinerRep\sample docs"/>
        </list>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.05"/>

        <!-- Per-document chain: Tokenize, Transform Cases, Filter Stopwords (English), Stem (Porter). -->
        <process expanded="true" height="370" width="617">
          <operator activated="true" class="text:tokenize" compatibility="5.1.004"
                    expanded="true" height="60" name="Tokenize"
                    width="90" x="51" y="29"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.1.004"
                    expanded="true" height="60" name="Transform Cases"
                    width="90" x="45" y="210"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.004"
                    expanded="true" height="60" name="Filter Stopwords (English)"
                    width="90" x="45" y="300"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.1.004"
                    expanded="true" height="60" name="Stem (Porter)"
                    width="90" x="179" y="300"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Duplicates the example set so both branches below receive it. -->
      <operator activated="true" class="multiply" compatibility="5.1.017"
                expanded="true" height="94" name="Multiply"
                width="90" x="179" y="30"/>

      <!-- Branch 1: k-means with k = 5, then keep only the cluster/id/label/metadata attributes. -->
      <operator activated="true" class="k_means" compatibility="5.1.017"
                expanded="true" height="76" name="Clustering"
                width="90" x="313" y="30">
        <parameter key="k" value="5"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="5.1.017"
                expanded="true" height="76" name="Select Attributes"
                width="90" x="447" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="cluster|id|label|metadata_date|metadata_file|metadata_path|"/>
      </operator>

      <!-- Deactivated (activated="false"); no <connect> element in this process references it. -->
      <operator activated="false" class="write_database" compatibility="5.1.017"
                expanded="true" height="60" name="Write Database"
                width="90" x="648" y="255">
        <parameter key="connection" value="localhost"/>
        <parameter key="table_name" value="tableAprilMeta"/>
      </operator>

      <!-- Branch 2: transpose the example set, copy attribute "id" to "keyword", then generate an ID. -->
      <operator activated="true" class="transpose" compatibility="5.1.017"
                expanded="true" height="76" name="Transpose"
                width="90" x="45" y="300"/>
      <operator activated="true" class="generate_copy" compatibility="5.1.017"
                expanded="true" height="76" name="Generate Copy"
                width="90" x="313" y="300">
        <parameter key="attribute_name" value="id"/>
        <parameter key="new_name" value="keyword"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="5.1.017"
                expanded="true" height="76" name="Generate ID"
                width="90" x="447" y="300"/>

      <!-- Wiring of the top-level process. -->
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Clustering" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_port="result 2"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Generate Copy" to_port="example set input"/>
      <connect from_op="Generate Copy" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0
Answers
Best, Marius
That's the reason I am using the Transpose operator. But then I lose the meta information (clustering, filename,...)
Is there a way to use Transpose and still keep all the information in the ResultSet?
Thank you
Best, Marius
The other IMPORTANT thing is that the data stored is expected to be exploited by a very different system (coded in Java/MySQL) so I am not sure about how to load it and restore the original data set.
Thank you, zwan
I tried the process after adding the Write Database operator, but it still doesn't save the information about the clusters and the filename in the database.
Isn't there a way to get this meta information into the MySQL table (along with the rest of the TF-IDF result set)?
Thank you
First of all, please update to the latest version of RapidMiner. Then try this process; it certainly does save the cluster information. This specific process does not store filenames, since the documents are not read from file.
Best, Marius