The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
Classification of tweets in to 3 different categories
I have 60 twitter datasets stored in different excel files with the fields (Tweets ID, Tweet Text, No of likes, No. of share, No.of Retweet and tweet categories). I have manually trained the 10,000 tweets which is stored in excel file. I want to automatically classify the remaining tweets using RapidMiner but don't know how. I want all the attributes in my output file with category (sports, politics, war) of tweets. Pl explain which operators should be used to classify the text in to categories step by step. I am new to Rapidminer. I watched videos but I am not able to understand anything.
Tagged:
0
Answers
If possible please tell me about the operators to be used to perform such type of analysis step by step. My assignment is due.
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="-1"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve Airlines Full" width="90" x="45" y="34">
<parameter key="repository_entry" value="Airlines Full"/>
</operator>
<operator activated="true" class="sample" compatibility="9.6.000" expanded="true" height="82" name="Sample" width="90" x="45" y="187">
<parameter key="sample" value="absolute"/>
<parameter key="balance_data" value="false"/>
<parameter key="sample_size" value="1000"/>
<parameter key="sample_ratio" value="0.1"/>
<parameter key="sample_probability" value="0.1"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="9.5.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="340">
<parameter key="parameter_expression" value=""/>
<parameter key="condition_class" value="custom_filters"/>
<parameter key="invert_filter" value="false"/>
<list key="filters_list">
<parameter key="filters_entry_key" value="recommended.is_not_missing."/>
</list>
<parameter key="filters_logic_and" value="true"/>
<parameter key="filters_check_metadata" value="true"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="9.6.000" expanded="true" height="82" name="Select Text Only" width="90" x="179" y="340">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value="content|recommended"/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="set_role" compatibility="9.6.000" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
<parameter key="attribute_name" value="recommended"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Dos from Data" width="90" x="179" y="34">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="TF-IDF"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_below_percent" value="1.0"/>
<parameter key="prune_above_percent" value="40.0"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="9999"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="content" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="112" y="30">
<parameter key="transform_to" value="lower case"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="246" y="30">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:stem_snowball" compatibility="9.3.001" expanded="true" height="68" name="Stem (Snowball)" width="90" x="380" y="30">
<parameter key="language" value="English"/>
</operator>
<operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="514" y="30"/>
<operator activated="true" class="text:filter_by_length" compatibility="9.3.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="30">
<parameter key="min_chars" value="4"/>
<parameter key="max_chars" value="25"/>
</operator>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
<connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="weight_by_information_gain" compatibility="9.6.000" expanded="true" height="82" name="Weight by Information Gain" width="90" x="313" y="34">
<parameter key="normalize_weights" value="true"/>
<parameter key="sort_weights" value="true"/>
<parameter key="sort_direction" value="descending"/>
</operator>
<operator activated="true" class="select_by_weights" compatibility="9.6.000" expanded="true" height="103" name="Select by Weights" width="90" x="447" y="34">
<parameter key="weight_relation" value="top k"/>
<parameter key="weight" value="0.1"/>
<parameter key="k" value="30"/>
<parameter key="p" value="0.5"/>
<parameter key="deselect_unknown" value="true"/>
<parameter key="use_absolute_weights" value="true"/>
</operator>
<operator activated="true" class="concurrency:cross_validation" compatibility="9.6.000" expanded="true" height="145" name="Cross Validation" width="90" x="581" y="34">
<parameter key="split_on_batch_attribute" value="false"/>
<parameter key="leave_one_out" value="false"/>
<parameter key="number_of_folds" value="10"/>
<parameter key="sampling_type" value="automatic"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<operator activated="true" class="k_nn" compatibility="9.6.000" expanded="true" height="82" name="k-NN" width="90" x="179" y="34">
<parameter key="k" value="5"/>
<parameter key="weighted_vote" value="true"/>
<parameter key="measure_types" value="MixedMeasures"/>
<parameter key="mixed_measure" value="MixedEuclideanDistance"/>
<parameter key="nominal_measure" value="NominalDistance"/>
<parameter key="numerical_measure" value="EuclideanDistance"/>
<parameter key="divergence" value="GeneralizedIDivergence"/>
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="1.0"/>
<parameter key="kernel_sigma1" value="1.0"/>
<parameter key="kernel_sigma2" value="0.0"/>
<parameter key="kernel_sigma3" value="2.0"/>
<parameter key="kernel_degree" value="3.0"/>
<parameter key="kernel_shift" value="1.0"/>
<parameter key="kernel_a" value="1.0"/>
<parameter key="kernel_b" value="0.0"/>
</operator>
<connect from_port="training set" to_op="k-NN" to_port="training set"/>
<connect from_op="k-NN" from_port="model" to_port="model"/>
<portSpacing port="source_training set" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="9.6.000" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
<list key="application_parameters"/>
<parameter key="create_view" value="false"/>
</operator>
<operator activated="true" class="performance_binominal_classification" compatibility="9.6.000" expanded="true" height="82" name="Performance" width="90" x="246" y="34">
<parameter key="manually_set_positive_class" value="false"/>
<parameter key="main_criterion" value="first"/>
<parameter key="accuracy" value="true"/>
<parameter key="classification_error" value="false"/>
<parameter key="kappa" value="true"/>
<parameter key="AUC (optimistic)" value="false"/>
<parameter key="AUC" value="true"/>
<parameter key="AUC (pessimistic)" value="false"/>
<parameter key="precision" value="false"/>
<parameter key="recall" value="false"/>
<parameter key="lift" value="false"/>
<parameter key="fallout" value="false"/>
<parameter key="f_measure" value="false"/>
<parameter key="false_positive" value="false"/>
<parameter key="false_negative" value="false"/>
<parameter key="true_positive" value="false"/>
<parameter key="true_negative" value="false"/>
<parameter key="sensitivity" value="false"/>
<parameter key="specificity" value="false"/>
<parameter key="youden" value="false"/>
<parameter key="positive_predictive_value" value="false"/>
<parameter key="negative_predictive_value" value="false"/>
<parameter key="psep" value="false"/>
<parameter key="skip_undefined_labels" value="true"/>
<parameter key="use_example_weights" value="true"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="performance 1"/>
<connect from_op="Performance" from_port="example set" to_port="test set results"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_test set results" spacing="0"/>
<portSpacing port="sink_performance 1" spacing="0"/>
<portSpacing port="sink_performance 2" spacing="0"/>
</process>
</operator>
<connect from_op="Retrieve Airlines Full" from_port="output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Select Text Only" to_port="example set input"/>
<connect from_op="Select Text Only" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Process Dos from Data" to_port="example set"/>
<connect from_op="Process Dos from Data" from_port="example set" to_op="Weight by Information Gain" to_port="example set"/>
<connect from_op="Weight by Information Gain" from_port="weights" to_op="Select by Weights" to_port="weights"/>
<connect from_op="Weight by Information Gain" from_port="example set" to_op="Select by Weights" to_port="example set input"/>
<connect from_op="Select by Weights" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
<connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
<connect from_op="Cross Validation" from_port="test result set" to_port="result 2"/>
<connect from_op="Cross Validation" from_port="performance 1" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="21"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
You can find this data set here: https://github.com/quankiquanki/skytrax-reviews-dataset