The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
text mining - all records placed in one cluster
Diana_Wegner
Member Posts: 4 Contributor I
I have a follow-up question to my previous text mining issue. K-means, like many of the other clustering modules, places all of my 3000 records in one cluster. I've tried different parameters with no luck. Do you have any hints to resolve this issue? Random Clustering is the only one that generates the number of clusters requested, however the results are not what I expected.
Here is the xml... THANKS!!
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output>
<location>//Local Repository/Result 1 Process Document Cluster</location>
<location>//Local Repository/Result 2 clustering</location>
<location>//Local Repository/Result 3</location>
</output>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.013" expanded="true" height="60" name="Read CSV" width="90" x="45" y="255">
<parameter key="csv_file" value="C:\Users\lzd3rc\Documents\ADM - Project Files\CrowdSourcing\data mining clustingering trustworthiness\innovation network\AQA Miner\rapidminer\GATS Input File 8 rapidminder.csv"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="UNIQID.true.text.label"/>
<parameter key="1" value="EXP1.true.text.attribute"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="255">
<parameter key="vector_creation" value="Term Frequency"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="9999"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="313" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="75">
<parameter key="min_chars" value="2"/>
<parameter key="max_chars" value="99999"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="dbscan" compatibility="5.3.013" expanded="true" height="76" name="Clustering" width="90" x="246" y="30"/>
<operator activated="true" class="write_as_text" compatibility="5.3.013" expanded="true" height="76" name="Write as Text" width="90" x="380" y="30">
<parameter key="result_file" value="C:\Users\lzd3rc\Documents\ADM - Project Files\CrowdSourcing\data mining clustingering trustworthiness\innovation network\AQA Miner\rapidminer\output rapidminer"/>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
<connect from_op="Clustering" from_port="cluster model" to_op="Write as Text" to_port="input 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Here is the xml... THANKS!!
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output>
<location>//Local Repository/Result 1 Process Document Cluster</location>
<location>//Local Repository/Result 2 clustering</location>
<location>//Local Repository/Result 3</location>
</output>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.013" expanded="true" height="60" name="Read CSV" width="90" x="45" y="255">
<parameter key="csv_file" value="C:\Users\lzd3rc\Documents\ADM - Project Files\CrowdSourcing\data mining clustingering trustworthiness\innovation network\AQA Miner\rapidminer\GATS Input File 8 rapidminder.csv"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="UNIQID.true.text.label"/>
<parameter key="1" value="EXP1.true.text.attribute"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="255">
<parameter key="vector_creation" value="Term Frequency"/>
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="9999"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="313" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="75">
<parameter key="min_chars" value="2"/>
<parameter key="max_chars" value="99999"/>
</operator>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="dbscan" compatibility="5.3.013" expanded="true" height="76" name="Clustering" width="90" x="246" y="30"/>
<operator activated="true" class="write_as_text" compatibility="5.3.013" expanded="true" height="76" name="Write as Text" width="90" x="380" y="30">
<parameter key="result_file" value="C:\Users\lzd3rc\Documents\ADM - Project Files\CrowdSourcing\data mining clustingering trustworthiness\innovation network\AQA Miner\rapidminer\output rapidminer"/>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
<connect from_op="Clustering" from_port="cluster model" to_op="Write as Text" to_port="input 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
I'm no expert, but anyway:
The process shows a dbscan clustering, not k-means. What parameters did you try?
Looks like minpoints and/or epsilon are either too high (= everything is treated as noise) or too low (= DBSCAN considers all documents near enough to each other to fall into one single cluster).
You can use the 'data to similarity' operator to calculate the similarities between
each document - this might give you a feel for your measure and the right epsilon.
Try a lower epsilon (let's say 0.1 or 0.01). What do you get, now?
There's an example process that iterates over different parameters for DBScan here
http://rapidminernotes.blogspot.co.uk/2010/12/counting-clusters.html.
regards
Andrew
I tested a number of the models and should have switched back to k-means before I copied the XML — the pasted process still shows DBSCAN.