"Expectation Mazimization Clustering very, very slow."

Nick595 · September 2015

Hi all, 

I'm learning more about Expectation Maximization Clustering, which I believe could be very helpful for my thesis. I have a dataset that contains a few hundred reviews. I want to discover hidden topics within these reviews, and see the probability that a review belongs to cluster 1, cluster 2, etc. However, when I stem and tokenize the data and then cluster it, the process takes hours. I have 8 GB available, but after 3 hours there is still no result.

My process is below:

 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 6.5.001) pasted from a forum question.
  Top-level flow, as wired by the connect elements near the end of the file:
    Read Excel, then Nominal to Text, then Process Documents from Data,
    then Clustering (2), then Write Excel.

  NOTE(review): in this paste the XML declaration above is preceded by a
  space. Strict XML parsers require the declaration to be the very first
  thing in the document, so trim that space before loading the snippet.
-->
<process version="6.5.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.5.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Step 1: read cells B1:B1337 of the workbook. The first row is NOT
           treated as a header (first_row_as_names=false); the single column
           is declared as "Sentence" via the meta data entry below.
           The path is an absolute path on the author's machine. -->
      <operator activated="true" class="read_excel" compatibility="6.5.001" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\Nick\Documents\Thesis DataSets\AudioSentenceReview2.xlsx"/>
        <parameter key="imported_cell_range" value="B1:B1337"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Sentence .true.text.attribute"/>
        </list>
      </operator>
      <!-- Step 2: Nominal to Text with no parameters set, so the operator's
           defaults apply. -->
      <operator activated="true" class="nominal_to_text" compatibility="6.5.001" expanded="true" height="76" name="Nominal to Text" width="90" x="112" y="120"/>
      <!-- Step 3: build a word vector from the text. The original text is kept
           (keep_text=true) and absolute pruning is used with bounds 2 and 999.
           NOTE(review): presumably this drops tokens occurring in fewer than 2
           or more than 999 documents; confirm against the operator docs. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="6.5.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="246" y="210">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="999"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Per-document pipeline, in the order wired below:
               Tokenize, Transform Cases, Filter Stopwords (English),
               Stem (Snowball), Filter Tokens (by Length). -->
          <operator activated="true" class="text:tokenize" compatibility="6.5.000" expanded="true" height="60" name="Tokenize" width="90" x="112" y="30"/>
          <operator activated="true" class="text:transform_cases" compatibility="6.5.000" expanded="true" height="60" name="Transform Cases" width="90" x="246" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="6.5.000" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="447" y="120"/>
          <operator activated="true" class="text:stem_snowball" compatibility="6.5.000" expanded="true" height="60" name="Stem (Snowball)" width="90" x="447" y="255"/>
          <!-- Length filter runs after stemming, with min_chars=3 and
               max_chars=99. -->
          <operator activated="true" class="text:filter_by_length" compatibility="6.5.000" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="514" y="30">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="99"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 4: Expectation Maximization clustering with no parameters set,
           so every setting is the operator default. It receives the example
           set from Process Documents from Data directly, with no operator in
           between.
           NOTE(review): the word vector presumably has one attribute per
           surviving token, so this operator may be fitting a very wide data
           set; that is a likely candidate for the reported multi-hour
           runtime. Confirm by inspecting the attribute count of the example
           set before this step. -->
      <operator activated="true" class="expectation_maximization_clustering" compatibility="6.5.001" expanded="true" height="76" name="Clustering (2)" width="90" x="380" y="120"/>
      <!-- Step 5: write the clustered example set to an Excel file (absolute
           path on the author's machine). -->
      <operator activated="true" class="write_excel" compatibility="6.5.001" expanded="true" height="76" name="Write Excel" width="90" x="514" y="255">
        <parameter key="excel_file" value="C:\Users\Nick\Documents\excelbestand 1.xlsx"/>
      </operator>
      <!-- Wiring: the cluster model goes to result 1; the clustered set goes
           through Write Excel and is then delivered as result 2. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering (2)" to_port="example set"/>
      <connect from_op="Clustering (2)" from_port="cluster model" to_port="result 1"/>
      <connect from_op="Clustering (2)" from_port="clustered set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Could anyone explain to me what I'm doing wrong here?

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

"Expectation Mazimization Clustering very, very slow."