The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
Write as text operator writing original text
I have a couple of loop files operators to conduct a series of preprocessing steps on a corpus of annual reports documents (including tokenization, stemming, etc.). The last step is to write the preprocessed documents in text files using the "Write as text" operator. However, it writes the original text rather than the tokenized version (seen on the top half of my results view - see attached screenshot).
Thanks for the help!
/Aya
Thanks for the help!
/Aya
<?xml version="1.0" encoding="UTF-8"?><process version="10.0.000"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="10.0.000" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="concurrency:loop_files" compatibility="10.0.000" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34"> <parameter key="directory" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR"/> <parameter key="filter_type" value="glob"/> <parameter key="filter_by_regex" value=".*\.docx$"/> <parameter key="recursive" value="true"/> <parameter key="skip_inaccessible" value="true"/> <parameter key="enable_macros" value="false"/> <parameter key="macro_for_file_name" value="file_name"/> <parameter key="macro_for_file_type" value="file_type"/> <parameter key="macro_for_folder_name" value="folder_name"/> <parameter key="reuse_results" value="false"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" class="handle_exception" compatibility="10.0.000" expanded="true" height="82" name="Handle Exception" width="90" x="179" y="34"> <parameter key="add_details_to_log" value="true"/> <process expanded="true"> <operator activated="true" class="text:read_document" compatibility="10.0.000" expanded="true" height="68" name="Read Document" width="90" x="112" y="34"> <parameter key="extract_text_only" value="true"/> <parameter key="use_file_extension_as_type" value="true"/> <parameter key="content_type" value="pdf"/> <parameter key="encoding" value="SYSTEM"/> </operator> <connect from_port="in 1" to_op="Read Document" to_port="file"/> <connect from_op="Read Document" from_port="output" to_port="out 1"/> <portSpacing port="source_in 1" spacing="0"/> <portSpacing port="source_in 2" spacing="0"/> <portSpacing port="sink_out 1" spacing="0"/> <portSpacing port="sink_out 2" spacing="0"/> </process> <process expanded="true"> <portSpacing port="source_in 1" spacing="0"/> <portSpacing port="source_in 2" spacing="0"/> <portSpacing port="sink_out 1" spacing="0"/> <portSpacing port="sink_out 2" spacing="0"/> </process> </operator> <connect from_port="file object" to_op="Handle Exception" to_port="in 1"/> <connect from_op="Handle Exception" from_port="out 1" to_port="output 1"/> <portSpacing port="source_file object" spacing="0"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_output 1" spacing="0"/> <portSpacing port="sink_output 2" spacing="0"/> </process> </operator> <operator activated="true" class="loop_collection" compatibility="10.0.000" expanded="true" height="82" name="Loop Collection" width="90" x="179" y="34"> <parameter key="set_iteration_macro" value="false"/> <parameter key="macro_name" value="iteration"/> <parameter key="macro_start_value" value="1"/> <parameter key="unfold" value="false"/> <process expanded="true"> <operator activated="true" class="text:tokenize" compatibility="10.0.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"> <parameter key="mode" value="non letters"/> <parameter key="characters" value=".:"/> <parameter key="language" value="English"/> <parameter key="max_token_length" value="3"/> </operator> <operator activated="true" class="text:transform_cases" compatibility="10.0.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"> <parameter key="transform_to" value="lower case"/> </operator> <operator activated="true" class="text:filter_by_length" compatibility="10.0.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34"> <parameter key="min_chars" value="3"/> <parameter key="max_chars" value="30"/> </operator> <operator activated="true" class="open_file" compatibility="10.0.000" expanded="true" height="68" name="Open File" width="90" x="313" y="289"> <parameter key="resource_type" value="file"/> <parameter key="filename" value="/Users/ayari88/Documents/Research/AFA/ROBOT/RapidMiner/Custom_stopwords_ar.csv"/> </operator> <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="10.0.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="447" y="187"> <parameter key="case_sensitive" value="false"/> <parameter key="encoding" value="UTF-8"/> </operator> <operator activated="true" class="text:stem_snowball" compatibility="10.0.000" expanded="true" height="68" name="Stem (Snowball)" width="90" x="581" y="34"> <parameter key="language" value="Swedish"/> </operator> <connect from_port="single" to_op="Tokenize" to_port="document"/> <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/> <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/> <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/> <connect from_op="Open File" from_port="file" to_op="Filter Stopwords (Dictionary)" to_port="file"/> <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Stem (Snowball)" to_port="document"/> <connect from_op="Stem (Snowball)" from_port="document" to_port="output 1"/> <portSpacing port="source_single" spacing="0"/> <portSpacing port="sink_output 1" spacing="0"/> <portSpacing port="sink_output 2" spacing="0"/> </process> </operator> <operator activated="true" class="loop_collection" compatibility="10.0.000" expanded="true" height="82" name="Write files" width="90" x="313" y="34"> <parameter key="set_iteration_macro" value="false"/> <parameter key="macro_name" value="iteration"/> <parameter key="macro_start_value" value="1"/> <parameter key="unfold" value="false"/> <process expanded="true"> <operator activated="false" class="text:write_document" compatibility="10.0.000" expanded="true" height="82" name="Write Document" width="90" x="112" y="238"> <parameter key="file" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR preprocessed/%{a}.txt"/> <parameter key="overwrite" value="true"/> <parameter key="encoding" value="SYSTEM"/> </operator> <operator activated="true" class="write_as_text" compatibility="10.0.000" expanded="true" height="82" name="Write as Text" width="90" x="380" y="34"> <parameter key="result_file" value="/Users/ayari88/Documents/Research/AFA/ROBOT/Kommuners AR preprocessed/%{a}.txt"/> <parameter key="encoding" value="SYSTEM"/> </operator> <connect from_port="single" to_op="Write as Text" to_port="input 1"/> <connect from_op="Write as Text" from_port="input 1" to_port="output 1"/> <portSpacing port="source_single" spacing="0"/> <portSpacing port="sink_output 1" spacing="0"/> <portSpacing port="sink_output 2" spacing="0"/> </process> </operator> <connect from_op="Loop Files" from_port="output 1" to_op="Loop Collection" to_port="collection"/> <connect from_op="Loop Collection" from_port="output 1" to_op="Write files" to_port="collection"/> <connect from_op="Write files" from_port="output 1" to_port="result 1"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> </process> </operator> </process>
Tagged:
0
Best Answer
-
jwpfau Employee-RapidMiner, Member Posts: 303 RM EngineeringHi,
you can try to use a combination of "Documents to Data" with use processed data checked → Data to Documents → Loop Collection with Write Document to store the tokenized version.
Greetings,
Jonas0
Answers
/Aya