Detecting languages in a dataset of texts
Hi everybody!
I'm Marco and I'm an engineering student.
I thank everyone in advance for the work of this community. On the advice of @sgenzer, I'm opening this thread so that people who have the same problem in the future can find a correct answer.
I have to analyze a collection of .txt documents (about 4000) in different languages. After a quick cleaning of the dataset (to delete records with the same content), I wanted to detect the languages to better organize my analysis:
- first I used the Rosette tool, and it only works partially,
- then I used the AYLIEN tool, and it works well... BUT
both tools need an API key that is either limited (1000 elements/day) or expensive...
So I tried the method from this thread https://community.rapidminer.com/t5/RapidMiner-Text-Analytics-Web/Detecting-written-text-language-in-text-mining-using/ta-p/44319 but, probably because I still have to study RapidMiner better, it doesn't work.
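As a side note, the duplicate-cleaning step (what Remove Duplicates does on the text attribute) can be sketched in plain Python. This is only an illustration of the idea; the file names and texts below are made up:

```python
# Sketch of the cleaning step: drop records whose text content is identical,
# keeping the first occurrence. Hashing the text avoids keeping full copies
# of long documents in the "seen" set.
import hashlib

def deduplicate(records):
    """records: list of (name, text) pairs; returns pairs with unique text."""
    seen = set()
    unique = []
    for name, text in records:
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        if digest not in seen:
            seen.add(digest)
            unique.append((name, text))
    return unique

# Illustrative data, not from the actual PubMed collection.
docs = [
    ("a.txt", "Lorem ipsum dolor sit amet."),
    ("b.txt", "Lorem ipsum dolor sit amet."),  # duplicate content
    ("c.txt", "Something different."),
]
print([name for name, _ in deduplicate(docs)])  # → ['a.txt', 'c.txt']
```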
Here is my current small process (I used a sample only to be faster, but once everything works I need to analyze the whole collection):
Here are the "results":
Here are the errors:
Here is my XML:
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="34">
<list key="text_directories">
<parameter key="Articles" value="C:\Users\MMA\Desktop\PubMed"/>
</list>
<parameter key="encoding" value="UTF-8"/>
<parameter key="create_word_vector" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true">
<connect from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="remove_duplicates" compatibility="8.2.001" expanded="true" height="103" name="Remove Duplicates" width="90" x="246" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="false" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="179" y="391">
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="text" value="1.0"/>
</list>
</operator>
<operator activated="false" class="text:process_documents" compatibility="8.1.000" expanded="true" height="82" name="Process Documents" width="90" x="380" y="391">
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
<operator activated="true" class="text:stem_snowball" compatibility="8.1.000" expanded="true" height="68" name="Stem (Snowball)" width="90" x="447" y="34"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="648" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
<connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="false" class="filter_examples" compatibility="8.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="514" y="391">
<list key="filters_list">
<parameter key="filters_entry_key" value="language.eq.en"/>
</list>
</operator>
<operator activated="true" class="sample" compatibility="8.2.001" expanded="true" height="82" name="Sample" width="90" x="380" y="34">
<parameter key="sample" value="relative"/>
<parameter key="sample_ratio" value="0.05"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
</operator>
<operator activated="true" class="web:encode_urls" compatibility="7.3.000" expanded="true" height="82" name="Encode URLs" width="90" x="514" y="34">
<parameter key="url_attribute" value="text"/>
<parameter key="encoding" value="UTF-8"/>
</operator>
<operator activated="true" class="web:enrich_data_by_webservice" compatibility="7.3.000" expanded="true" height="68" name="Enrich Data by Webservice" width="90" x="648" y="34">
<parameter key="query_type" value="JsonPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<list key="jsonpath_queries">
<parameter key="language" value="$..language"/>
<parameter key="isReliable" value="$..isReliable"/>
<parameter key="confidence" value="$..confidence"/>
</list>
<parameter key="request_method" value="POST"/>
<parameter key="body" value="<%text%>"/>
<parameter key="url" value="https://ws.detectlanguage.com/0.2/detect?q=&lt;%text%&gt;&amp;key=*********************************"/>
<list key="request_properties"/>
<parameter key="encoding" value="UTF-8"/>
</operator>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Remove Duplicates" to_port="example set input"/>
<connect from_op="Remove Duplicates" from_port="example set output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Encode URLs" to_port="example set input"/>
<connect from_op="Encode URLs" from_port="example set output" to_op="Enrich Data by Webservice" to_port="Example Set"/>
<connect from_op="Enrich Data by Webservice" from_port="ExampleSet" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
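For reference, the three JsonPath queries in the Enrich Data by Webservice operator ($..language, $..isReliable, $..confidence) recursively collect those keys anywhere in the JSON the service returns. A minimal standard-library sketch of that extraction follows; the sample response is an assumption of what the detectlanguage.com 0.2 endpoint returns, not captured output:

```python
# Emulate the recursive-descent JsonPath queries ($..key) used in the
# Enrich Data by Webservice operator, using only the standard library.
import json

def find_all(obj, key):
    """Recursively collect every value stored under `key`, like JsonPath $..key."""
    hits = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                hits.append(v)
            hits.extend(find_all(v, key))
    elif isinstance(obj, list):
        for item in obj:
            hits.extend(find_all(item, key))
    return hits

# Assumed shape of a detectlanguage.com 0.2 response (illustrative).
response = json.loads(
    '{"data": {"detections": [{"language": "en", "isReliable": true, "confidence": 12.04}]}}'
)
print(find_all(response, "language"))    # → ['en']
print(find_all(response, "isReliable"))  # → [True]
print(find_all(response, "confidence"))  # → [12.04]
```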
PS: I wrote ****************** on purpose in the API key line so as not to show it, so this isn't an error.
I'm probably missing something trivial, but I trust in your help.
Thank you all again,
Marco.
Best Answer
lionelderkrikor RapidMiner Certified Analyst, Member Posts: 1,195 Unicorn
Good news:
The results you obtained with the first method (the Enrich Data by Webservice operator) were weird.
Indeed, the Process Documents from Files operator doesn't expose the 'text' attribute as a regular attribute (even though the keep text parameter is checked), so the request to the web service was empty; consequently the returned results were empty too and were displayed as '?' by RapidMiner...
Note: you can see in Marco's second screenshot (first post) that there is no text attribute in the resulting example set...
The fix here is to use a Set Role operator to "reveal" this text attribute (set it to "regular" in the operator), and it works:
The process :
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="34">
<list key="text_directories">
<parameter key="test" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Detect_language"/>
</list>
<parameter key="file_pattern" value="*.txt"/>
<parameter key="create_word_vector" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true">
<connect from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
<parameter key="attribute_name" value="text"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="web:encode_urls" compatibility="7.3.000" expanded="true" height="82" name="Encode URLs" width="90" x="581" y="34">
<parameter key="url_attribute" value="text"/>
<parameter key="encoding" value="UTF-8"/>
</operator>
<operator activated="true" class="web:enrich_data_by_webservice" compatibility="7.3.000" expanded="true" height="68" name="Enrich Data by Webservice" width="90" x="715" y="34">
<parameter key="query_type" value="JsonPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries">
<parameter key="foo" value=".*"/>
</list>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<list key="jsonpath_queries">
<parameter key="language" value="$..language"/>
<parameter key="isReliable" value="$..isReliable"/>
<parameter key="confidence" value="$..confidence"/>
</list>
<parameter key="url" value="http://ws.detectlanguage.com/0.2/detect?q=&lt;%text%&gt;&amp;key=xxxxxxxxx"/>
<parameter key="delay" value="10"/>
<list key="request_properties"/>
</operator>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Encode URLs" to_port="example set input"/>
<connect from_op="Encode URLs" from_port="example set output" to_op="Enrich Data by Webservice" to_port="Example Set"/>
<connect from_op="Enrich Data by Webservice" from_port="ExampleSet" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Regards,
Lionel
Answers
Hi @s242936,
I propose an alternative solution: a very simple Python script using the "textblob" library.
The results look like this:
To execute this process, you have to:
- install Python on your computer
- install the textblob library (pip install textblob)
- set the name of your text attribute, with quotes, in the Set Macros operator:
I hope it helps,
Regards,
Lionel
NB : Language detection is powered by the Google Translate API.
Hi, Lionel,
thanks for your answer. Could you show your process, or send me a private message with more instructions?
I'm swamped with work and in a panic phase!
Thanks for your help!
Hi @s242936
Indeed, I forgot to share the process in my last post.
Here it is :
Regards,
Lionel
lionelderkrikor ,
thanks for your help, but my process didn't work.
I don't understand, what's wrong?
I followed all the steps you indicated in your post.
Thank you for your patience!
Hi @s242936,
I have good news and bad news:
- the good news is that I think I fixed your problem (no data produced): I think you have to update the file pattern (*.txt) in the
Process Documents from Files operator;
- the bad news is that the process doesn't detect the language (after some tests with my own text files), and after some research I don't know why...:
So I propose that you test this new process with your own text files; if it doesn't work, tell me and I will try another Python library (NLTK, for example...)
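In the meantime, if neither the web service nor a Python library is available, a crude stopword-counting heuristic needs nothing beyond the standard library. This is only a sketch with tiny, illustrative word lists, far less accurate than a real detector, but it shows the idea:

```python
# Guess a text's language by counting hits against small stopword lists.
# The word lists are deliberately tiny and illustrative; a real detector
# (or fuller stopword lists per language) would be far more accurate.
STOPWORDS = {
    "en": {"the", "and", "of", "to", "in", "is", "that", "it"},
    "it": {"il", "la", "di", "che", "e", "un", "per", "non"},
    "fr": {"le", "la", "de", "et", "un", "que", "pour", "dans"},
}

def guess_language(text):
    tokens = text.lower().split()
    # Score each language by how many tokens appear in its stopword list.
    scores = {lang: sum(t in words for t in tokens)
              for lang, words in STOPWORDS.items()}
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "unknown"

print(guess_language("the results of the analysis are in the report"))  # → 'en'
print(guess_language("il risultato di questa analisi non è chiaro"))    # → 'it'
```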
Regards,
Lionel
NB : The process:
Hi @lionelderkrikor,
thank you for the time you're spending to help me.
Unfortunately the languages were not detected when applying your last process, but my output is different from yours. Here is a screenshot of my output with warnings (reading these warnings, I think the script detects the languages but isn't able to write them):
What do you think?
Greetings
Hi again @s242936,
Can you share some of your .txt files so that I can reproduce this new behaviour?
Regards,
Lionel
Thank you a lot!
hi @s242936 - sorry I went dark there. Actually, there is a very small but easy fix to the original process. You just need to add a "Set Role" operator after your Process Documents from Files. Your "text" attribute is labeled as a special attribute, which is not picked up by Enrich Data by Webservice. So just add a Set Role, set the "text" attribute to "regular", and you should be all set. I just tested it and all is good. See the process below and the data sets attached.
[EDIT - Duh I could have just looked at @lionelderkrikor's solution which does the same thing!]
Scott