The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
processing files and missing examples in exampleSet
Hi there,
i don't know how to help myself any other way, so i hope some of you can help me.
it's just a small question that kind of gives me a headache... I am doing a sentiment analysis with a bunch of .pdf files — 520 to be exact.
i am processing them with the process documents from files operator and furthermore i am using the operators tokenize (non letters), filter tokens (by length), filter stopwords and transform cases.
now instead of getting as many examples in my exampleset as processed documents (which would be 520), i only receive 439 examples.
anyone who can explain that to me?
thanks in advance!
i don't know how to help myself any other way, so i hope some of you can help me.
it's just a small question that kind of gives me a headache... I am doing a sentiment analysis with a bunch of .pdf files — 520 to be exact.
i am processing them with the process documents from files operator and furthermore i am using the operators tokenize (non letters), filter tokens (by length), filter stopwords and transform cases.
now instead of getting as many examples in my exampleset as processed documents (which would be 520), i only receive 439 examples.
anyone who can explain that to me?
thanks in advance!
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner Studio 6.5 process: reads PDFs from five directories via
     "Process Documents from Files", runs a per-document text-preprocessing
     chain, and outputs an example set (one example per parsed document).
     NOTE(review): the poster feeds 520 PDFs but gets only 439 examples -
     the operator skips files it cannot extract text from (corrupt,
     image-only, or password-protected PDFs) without failing the process;
     check the process log for per-file parse warnings to confirm. -->
<process version="6.5.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.5.002" expanded="true" name="Process">
<process expanded="true">
<!-- Source operator: each entry in text_directories maps a label key
     (Codec0..Codec4) to one input directory; presumably the key becomes the
     label/class value of the documents read from it - verify in the result. -->
<operator activated="true" class="text:process_document_from_file" compatibility="6.5.000" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
<list key="text_directories">
<parameter key="Codec0" value="/Volumes/RapidMiner Studio/BA/Daten Kopie/PDF/pdf_10Proz_2001_Codec/0"/>
<parameter key="Codec1" value="/Volumes/RapidMiner Studio/BA/Daten Kopie/PDF/pdf_10Proz_2001_Codec/1"/>
<parameter key="Codec2" value="/Volumes/RapidMiner Studio/BA/Daten Kopie/PDF/pdf_10Proz_2001_Codec/2"/>
<parameter key="Codec3" value="/Volumes/RapidMiner Studio/BA/Daten Kopie/PDF/pdf_10Proz_2001_Codec/3"/>
<parameter key="Codec4" value="/Volumes/RapidMiner Studio/BA/Daten Kopie/PDF/pdf_10Proz_2001_Codec/4"/>
</list>
<parameter key="content_type" value="pdf"/>
<!-- create_word_vector=false: no term vector is built, so the
     vector_creation setting below has no effect in this run. -->
<parameter key="create_word_vector" value="false"/>
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<!-- keep_text=true: the extracted document text is stored as a (special)
     attribute of each example. -->
<parameter key="keep_text" value="true"/>
<!-- Inner per-document chain:
     Tokenize -> Filter by Length -> Filter Stopwords (German)
     -> Stem (German) -> Transform Cases.
     NOTE(review): Transform Cases runs last; if the stopword filter is
     case-sensitive, capitalized stopwords may survive - consider moving
     Transform Cases before the stopword filter (confirm in RapidMiner docs). -->
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="6.5.000" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="6.5.000" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_german" compatibility="6.5.000" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="30">
<parameter key="stop_word_list" value="Sentiment"/>
</operator>
<operator activated="true" class="text:stem_german" compatibility="6.5.000" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="6.5.000" expanded="true" height="60" name="Transform Cases" width="90" x="581" y="30"/>
<!-- Wiring: the document flows linearly through the five operators above. -->
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
<connect from_op="Stem (German)" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2"  spacing="0"/>
</process>
</operator>
<!-- The resulting example set is delivered to the process result port. -->
<connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
0
Answers