The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
Removing mentions with "@" and emojis from Excel Data
Hello RapidMiner Community,
I am currently working on a supervised sentiment analysis. I had success doing the sentiment analysis itself, but I'm not quite happy with the data it uses.
As part of the data preparation, I want to remove mentions (i.e. names following an "@") and I have tried out some suggestions. The process I have generated so far is uploaded here as well as the test data.
I am working with the "replace" operator but sadly, following this process, the outcome still incorporates some mentions. These mentions are still there because either a) they are the second mention in one row or b) the mention is not right at the beginning of the row.
Do any of you guys have some input regarding this?
In general, the goals I am trying to achieve are:
-remove any word (not the whole row) starting with "@".
-remove empty rows
-remove duplicates
-remove emojis (right now, with this process I ended up with question marks instead of the emojis as output, so I'd rather remove the emojis right away)
Grateful for any suggestions!
Anna May
I am currently working on a supervised sentiment analysis. I had success doing the sentiment analysis itself, but I'm not quite happy with the data it uses.
As part of the data preparation, I want to remove mentions (i.e. names following an "@") and I have tried out some suggestions. The process I have generated so far is uploaded here as well as the test data.
I am working with the "replace" operator but sadly, following this process, the outcome still incorporates some mentions. These mentions are still there because either a) they are the second mention in one row or b) the mention is not right at the beginning of the row.
Do any of you guys have some input regarding this?
In general, the goals I am trying to achieve are:
-remove any word (not the whole row) starting with "@".
-remove empty rows
-remove duplicates
-remove emojis (right now, with this process I ended up with question marks instead of the emojis as output, so I'd rather remove the emojis right away)
Grateful for any suggestions!
Anna May
<!-- RapidMiner process (compatibility 9.8.000).
     Data flow, as wired by the connect elements at the bottom:
     Read Excel, then Replace, then Replace (2), then result 1.
     The end tags missing from this copy have been restored so the fragment is well-formed XML. -->
<operator activated="true" class="process" compatibility="9.8.000" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Loads sheet 1 of the workbook; the first row supplies the attribute names.
         The excel_file path is specific to the author's machine and must be adjusted locally. -->
    <operator activated="true" class="read_excel" compatibility="9.8.000" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\Test Comments 1.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="true"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="Comments.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
    </operator>
    <!-- Targets every character outside the ASCII range \x00-\x7F in all nominal attributes.
         This covers emojis, but also any other non-ASCII character such as accented letters.
         No replace_by parameter is present; presumably matches are replaced by an empty string (confirm in the operator). -->
    <operator activated="true" class="replace" compatibility="9.8.000" expanded="true" height="82" name="Replace" width="90" x="179" y="34">
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="nominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="file_path"/>
      <parameter key="block_type" value="single_value"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="single_value"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="replace_what" value="[^\x00-\x7F]"/>
      <description align="center" color="transparent" colored="false" width="126">Replace all non-ascii letters</description>
    </operator>
    <!-- NOTE(review): the replace_what value was cut off in this copy (the closing quote was missing).
         Only the quote has been restored, so as written this pattern matches the @ character alone,
         not the name that follows it. The later copy of this process uses @(\w+) instead. -->
    <operator activated="true" class="replace" compatibility="9.8.000" expanded="true" height="82" name="Replace (2)" width="90" x="313" y="34">
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="nominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="file_path"/>
      <parameter key="block_type" value="single_value"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="single_value"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="replace_what" value="@"/>
      <description align="center" color="transparent" colored="false" width="126">Replace @</description>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Replace" to_port="example set input"/>
    <connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
    <connect from_op="Replace (2)" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
Dortmund, Germany
thanks for the quick reply
Do you have any input as to why this might be the case?
Anna May
<!-- RapidMiner process (compatibility 9.8.000).
     Data flow, as wired by the connect elements at the bottom:
     Read Excel, then Replace, then Replace (2), then result 1.
     The end tags missing from this copy have been restored so the fragment is well-formed XML. -->
<operator activated="true" class="process" compatibility="9.8.000" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Loads sheet 1 of the workbook; the first row supplies the attribute names.
         The excel_file path is specific to the author's machine and must be adjusted locally. -->
    <operator activated="true" class="read_excel" compatibility="9.8.000" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\Test Comments 1.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="1"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="true"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="Comments.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
    </operator>
    <!-- Targets every character outside the ASCII range \x00-\x7F in all nominal attributes.
         This covers emojis, but also any other non-ASCII character such as accented letters.
         No replace_by parameter is present; presumably matches are replaced by an empty string (confirm in the operator). -->
    <operator activated="true" class="replace" compatibility="9.8.000" expanded="true" height="82" name="Replace" width="90" x="179" y="34">
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="nominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="file_path"/>
      <parameter key="block_type" value="single_value"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="single_value"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="replace_what" value="[^\x00-\x7F]"/>
      <description align="center" color="transparent" colored="false" width="126">Replace all non-ascii letters</description>
    </operator>
    <!-- The pattern matches an @ followed by one or more word characters, at any position in the value,
         so it is not limited to a mention at the start of a row.
         No replace_by parameter is present; presumably matches are replaced by an empty string (confirm in the operator). -->
    <operator activated="true" class="replace" compatibility="9.8.000" expanded="true" height="82" name="Replace (2)" width="90" x="313" y="34">
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="nominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="file_path"/>
      <parameter key="block_type" value="single_value"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="single_value"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="replace_what" value="@(\w+)"/>
      <description align="center" color="transparent" colored="false" width="126">Replace @</description>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Replace" to_port="example set input"/>
    <connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
    <connect from_op="Replace (2)" from_port="example set output" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
Dortmund, Germany
thanks again for your time! I have no idea why but this still doesn't work for me. Would you mind sharing your process in another format?
Anna May
Dortmund, Germany
Dortmund, Germany