
I have a problem removing URLs and hashtags in the data (from Excel)

fangirl96 Member Posts: 2 Contributor I
edited November 2018 in Help
I'm having a problem removing URLs and hashtags from my data (from Excel). I read in the data (tweets) using three Read Excel operators and then appended them. After that, I connected the Append operator to Replace and entered regexes for the URLs and hashtags in the parameters named "regular expression" and "replace what". Then I connected it to Data to Documents and then to Process Documents, where I have Transform Cases, Tokenize, and Filter Stopwords (Dictionary), in that order. The results were tokenized and the stopwords I created were removed, but for the hashtags only the # symbol was removed: for example, the original text #vscocam came out as vscocam. The URLs were not removed at all; they were just tokenized too.
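For reference, these are the two patterns in the Replace Tokens dictionary (they appear again in the full process XML posted in the answers below):

    <list key="replace_dictionary">
    <parameter key="@[a-zA-Z]*" value=" "/>
    <parameter key="#[a-zA-Z0-9]*" value=" "/>
    </list>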

Answers

  • sgenzer Administrator, Moderator, Employee-RapidMiner, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    hello @fangirl96 - welcome to the community. I think I understand, and I believe you just need to adjust your regex. Can you give some examples and the process you're using? (See the "Read Before Posting" instructions on the right.)


    Scott

     

  • fangirl96 Member Posts: 2 Contributor I

    This is the full xml of my process.

    <?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="read_excel" compatibility="7.5.003" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
    <parameter key="excel_file" value="C:\Users\ace\Desktop\Airasia1 total.xlsx"/>
    <parameter key="imported_cell_range" value="A1:A14"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    </list>
    </operator>
    <operator activated="true" class="read_excel" compatibility="7.5.003" expanded="true" height="68" name="Read Excel (3)" width="90" x="45" y="136">
    <parameter key="excel_file" value="C:\Users\ace\Dropbox\Thesis V3.0\Thesis 2 - data gathering (testing 3) with additional\Negative\neg_airasia.xlsx"/>
    <parameter key="imported_cell_range" value="A1:A184"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    </list>
    </operator>
    <operator activated="true" class="read_excel" compatibility="7.5.003" expanded="true" height="68" name="Read Excel (4)" width="90" x="45" y="238">
    <parameter key="excel_file" value="C:\Users\ace\Dropbox\Thesis V3.0\Thesis 2 - data gathering (testing 3) with additional\Negative\neg_cebupac.xlsx"/>
    <parameter key="imported_cell_range" value="A1:A53"/>
    <parameter key="first_row_as_names" value="false"/>
    <list key="annotations">
    <parameter key="0" value="Name"/>
    </list>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="Text.true.text.attribute"/>
    </list>
    </operator>
    <operator activated="true" class="append" compatibility="7.5.003" expanded="true" height="124" name="Append" width="90" x="179" y="136"/>
    <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="34">
    <list key="specify_weights"/>
    </operator>
    <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents" width="90" x="447" y="34">
    <process expanded="true">
    <operator activated="true" breakpoints="before,after" class="text:replace_tokens" compatibility="7.5.000" expanded="true" height="68" name="Replace Tokens" width="90" x="112" y="34">
    <list key="replace_dictionary">
    <parameter key="@[a-zA-Z]*" value=" "/>
    <parameter key="#[a-zA-Z0-9]*" value=" "/>
    </list>
    </operator>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="112" y="136"/>
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="238">
    <parameter key="expression" value="\[\d*\][^\[\]]*"/>
    </operator>
    <operator activated="true" class="text:stem_porter" compatibility="7.5.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="246" y="136"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="246" y="238">
    <parameter key="file" value="C:\Users\ace\Dropbox\Thesis V3.0\THESIS 4\airasia.txt"/>
    </operator>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="380" y="238"/>
    <connect from_port="document" to_op="Replace Tokens" to_port="document"/>
    <connect from_op="Replace Tokens" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
    <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Read Excel (3)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Read Excel (4)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Append" from_port="merged set" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
    <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
    <connect from_op="Process Documents" from_port="word list" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    </process>
    </operator>
    </process>

     The links are not removed, but the hashtags are.

    PS: The links included in my data start with https.
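    Looking at the replace dictionary in the process above, there are entries for @mentions and #hashtags but none that can match a link, which would explain why the URLs survive untouched. A minimal sketch of a third entry, assuming (per the PS) that every link starts with https - the pattern https?://\S* is an illustration, not part of the original process:

    <list key="replace_dictionary">
    <parameter key="@[a-zA-Z]*" value=" "/>
    <parameter key="#[a-zA-Z0-9]*" value=" "/>
    <parameter key="https?://\S*" value=" "/>
    </list>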

  • sgenzer Administrator, Moderator, Employee-RapidMiner, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager

    thank you @fangirl96 - can you share one of those excel sheets as well?

     

    Scott

     

  • Thomas_Ott RapidMiner Certified Analyst, RapidMiner Certified Expert, Member Posts: 1,761 Unicorn

    @fangirl96 take a look at my tutorial process here: http://www.neuralmarkettrends.com/blog/entry/use-rapidminer-discover-twitter-content

    I extract hashtags and replace https: links with a generic word called 'link'.
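    A rough sketch of that idea in the same Replace Tokens dictionary form used in the process above (the pattern https\S* and the replacement value are illustrative assumptions; the linked tutorial shows the actual setup):

    <list key="replace_dictionary">
    <parameter key="https\S*" value="link"/>
    </list>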

     

     
