Extract topics from data (LDA)
Hi
Via Process Documents from Files (subprocess: Transform Cases > Tokenize > Filter Stopwords > Filter Tokens by Length) I have created an example set from 6 documents, and I want to extract topics from it via the Extract Topics from Data (LDA) operator. Everything works fine, except that tokens of fewer than 4 characters still show up in the word lists belonging to the topics, as do stopwords (the, and, ...). Does anyone know what else I can do to solve this? I already used the Filter Stopwords and Filter Tokens by Length (4 - 25) operators in the first step, but I'm apparently doing something wrong, since those meaningless words still end up in the topic list. @mschmitz can you maybe help?
This is the XML file - Thank you very much!
<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="34">
<list key="text_directories">
<parameter key="MembersCoops" value="F:\2019-2020 Thesis MBA\Bylaws Coops\Labour-members"/>
</list>
<parameter key="file_pattern" value="*"/>
<parameter key="extract_text_only" value="true"/>
<parameter key="use_file_extension_as_type" value="true"/>
<parameter key="content_type" value="txt"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="TF-IDF"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_absolute" value="3"/>
<parameter key="prune_above_absolute" value="9999"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="data_management" value="auto"/>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34">
<parameter key="transform_to" value="lower case"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="447" y="34">
<parameter key="min_chars" value="4"/>
<parameter key="max_chars" value="20"/>
</operator>
<operator activated="true" class="text:stem_porter" compatibility="9.3.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
<connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="operator_toolbox:lda_exampleset" compatibility="2.4.000" expanded="true" height="124" name="Extract Topics from Data (LDA)" width="90" x="380" y="136">
<parameter key="text_attribute" value="text"/>
<parameter key="number_of_topics" value="10"/>
<parameter key="use_alpha_heuristics" value="true"/>
<parameter key="alpha_sum" value="0.1"/>
<parameter key="use_beta_heuristics" value="true"/>
<parameter key="beta" value="0.01"/>
<parameter key="optimize_hyperparameters" value="true"/>
<parameter key="optimize_interval_for_hyperparameters" value="10"/>
<parameter key="top_words_per_topic" value="5"/>
<parameter key="iterations" value="1000"/>
<parameter key="reproducible" value="false"/>
<parameter key="enable_logging" value="false"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Extract Topics from Data (LDA)" to_port="exa"/>
<connect from_op="Process Documents from Files" from_port="word list" to_port="result 1"/>
<connect from_op="Extract Topics from Data (LDA)" from_port="exa" to_port="result 2"/>
<connect from_op="Extract Topics from Data (LDA)" from_port="top" to_port="result 3"/>
<connect from_op="Extract Topics from Data (LDA)" from_port="mod" to_port="result 4"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
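For a cross-check outside RapidMiner, here is a minimal Python sketch of the pipeline described in the question, with scikit-learn standing in for the operators above (the directory name is a placeholder and stemming is omitted). One plausible explanation for the symptom is that the LDA operator re-tokenizes the raw text kept by keep_text, bypassing the filters; when the model is instead fit on the filtered document-term matrix, stopwords and short tokens cannot appear among the top words:

from pathlib import Path

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Read the 6 plain-text documents (placeholder for the Labour-members folder).
docs = [p.read_text(encoding="utf-8") for p in Path("Labour-members").glob("*.txt")]

# Mirror the subprocess: Transform Cases -> Tokenize (non letters) ->
# Filter Stopwords (English) -> Filter Tokens by Length (4-25 characters).
vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b[a-z]{4,25}\b",
    stop_words="english",
)
counts = vectorizer.fit_transform(docs)  # raw counts; LDA is usually fit on counts, not TF-IDF

lda = LatentDirichletAllocation(n_components=10, max_iter=100, random_state=1992)
lda.fit(counts)

# Top 5 words per topic, drawn from the filtered vocabulary only --
# no stopwords or short tokens can reappear here.
terms = vectorizer.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top = terms[weights.argsort()[::-1][:5]]
    print(f"Topic {k}: {', '.join(top)}")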
Answers
@mschmitz should be able to shed some light.
In the meantime, what happens if you store the data as a new example set after the text processing and then retrieve it as a fresh set before you run the LDA topic extraction? That should cut off its access to the various stopwords and short tokens.
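In Python terms, that store-and-retrieve idea amounts to persisting the filtered document-term matrix and reloading it as a fresh dataset before fitting LDA. A minimal sketch, reusing counts from the sketch above (the file name is a placeholder):

import scipy.sparse as sp
from sklearn.decomposition import LatentDirichletAllocation

sp.save_npz("processed_docs.npz", counts)   # "Store" the preprocessed data
fresh = sp.load_npz("processed_docs.npz")   # "Retrieve" it as a fresh set
# Fit LDA on the reloaded matrix: it only ever sees the filtered tokens.
LatentDirichletAllocation(n_components=10, random_state=1992).fit(fresh)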
Lindon Ventures
Data Science Consulting from Certified RapidMiner Experts