Cleaning twitter data

ilze · May 2017

I'm new to RapidMiner, and I am struggling to understand how the Filter operators can be used to clean up Twitter feeds. I am importing the tweets from a CSV file and am trying to create sub-processes within the Process Documents operator to remove Twitter handles (@), RT markers, and hashtags. For example, I have tried using Filter Tokens (by Content) with the condition set to "contains" and the string set to @. Although the process runs without errors, I cannot see in the results that the Twitter handles were removed. Can anybody please advise on how to go about cleaning up the data?

Thomas_Ott · May 2017

When you load the tweets from a CSV file, they come in as the Nominal data type. To use Filter Tokens (by Content), you first need to convert those tweets to the Text data type with a Nominal to Text operator.

Here's a sample using the Search Twitter operator that does some cleaning.

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
  <!-- Process context: no inputs or outputs are bound. It defines one macro,
       "keywords", which other parts of the process reference as %{keywords}. -->
  <context>
    <input/>
    <output/>
    <macros>
      <macro>
        <key>keywords</key>
        <value>Donald Trump</value>
      </macro>
    </macros>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Searches Twitter for the %{keywords} macro, limited to 1000 results with
           language "en". "ThomasOtt" is the name of a stored connection; replace
           it with a connection that exists in your own installation. -->
      <operator name="Search Twitter"
                class="social_media:search_twitter"
                activated="true"
                compatibility="7.3.000"
                expanded="true"
                height="68" width="90" x="45" y="34">
        <parameter key="connection" value="ThomasOtt"/>
        <parameter key="query" value="%{keywords}"/>
        <parameter key="limit" value="1000"/>
        <parameter key="language" value="en"/>
      </operator>
      <!-- Keeps only the subset of attributes named below: Text, Id and Retweet-Count. -->
      <operator name="Select Attributes"
                class="select_attributes"
                activated="true"
                compatibility="7.5.000"
                expanded="true"
                height="82" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="Text|Id|Retweet-Count"/>
      </operator>
      <!-- Rewrites each hashtag "#tag" as "hashtag_tag" so the marker survives as
           letters once the "#" symbol is stripped later.
           The capture group is (\w+) rather than (.*): a greedy (.*) runs to the
           end of the line, so only the first hashtag in a tweet would be rewritten
           and any later "#" would be carried along unchanged inside $1.
           Note: \w covers ASCII letters, digits and underscore only, which suits
           the English-language query used by this process. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="Replace" width="90" x="313" y="34">
        <parameter key="replace_what" value="#(\w+)"/>
        <parameter key="replace_by" value="hashtag_$1"/>
      </operator>
      <!-- Converts the single attribute "Text" from nominal to the text type so
           that the document-processing operators downstream can work on it. -->
      <operator name="Nominal to Text"
                class="nominal_to_text"
                activated="true"
                compatibility="7.5.000"
                expanded="true"
                height="82" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Text"/>
      </operator>
      <!-- Turns the text attribute(s) of the incoming example set into a word
           vector and a word list. Pruning is "percentual" with an upper bound of
           50.0; presumably this drops terms that occur in more than 50% of the
           documents (confirm against the operator documentation).
           The inner sub-process is applied to every document, in this order
           (as fixed by the connect elements below):
             Tokenize, Transform Cases, Filter Tokens (by Length), Replace Tokens,
             Generate n-Grams (Terms), Filter Tokens (by Content),
             Filter Stopwords (English).
           NOTE(review): the stopword filter runs after n-gram generation, so
           n-grams that contain a stopword may survive; confirm this is intended. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="50.0"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- The next three operators set no parameters, so their defaults apply. -->
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34"/>
          <!-- Maps the URL scheme tokens "https" and "http" to the placeholder "link". -->
          <operator activated="true" class="text:replace_tokens" compatibility="7.4.001" expanded="true" height="68" name="Replace Tokens" width="90" x="447" y="34">
            <list key="replace_dictionary">
              <parameter key="https" value="link"/>
              <parameter key="http" value="link"/>
            </list>
          </operator>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="581" y="34"/>
          <!-- Content filter on the string "link" with the condition inverted;
               presumably this discards the "link" placeholders produced above
               (the condition type is left at its default; confirm). -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="715" y="34">
            <parameter key="string" value="link"/>
            <parameter key="invert condition" value="true"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="849" y="34"/>
          <!-- Wiring: a single linear chain from the sub-process input port
               "document" to the output port "document 1". -->
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Replace Tokens" to_port="document"/>
          <connect from_op="Replace Tokens" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Converts the word list into an example set; no parameters are set. -->
      <operator name="WordList to Data"
                class="text:wordlist_to_data"
                activated="true"
                compatibility="7.4.001"
                expanded="true"
                height="82" width="90" x="715" y="85"/>
      <!-- Orders the rows by the "total" attribute, in decreasing order. -->
      <operator name="Sort"
                class="sort"
                activated="true"
                compatibility="7.5.000"
                expanded="true"
                height="82" width="90" x="849" y="85">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <!-- Writes the sorted table to an .xlsx file whose name embeds the
           %{keywords} macro. The path is an absolute Windows path under one
           specific user profile; change it before running on another machine. -->
      <operator name="Write Excel"
                class="write_excel"
                activated="true"
                compatibility="7.5.000"
                expanded="true"
                height="82" width="90" x="983" y="85">
        <parameter key="excel_file" value="C:\Users\ThomasOtt\Desktop\Important Twitter Words for %{keywords}.xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <!-- Top-level wiring. Main chain: Search Twitter, Select Attributes,
           Replace, Nominal to Text, Process Documents from Data.
           The "example set" output of Process Documents from Data goes to
           result 1. Its "word list" output goes through WordList to Data, Sort
           and Write Excel, and the Excel pass-through goes to result 2. -->
      <connect from_op="Search Twitter" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Cleaning twitter data

Answers