Compare and analyse text documents

TobiasNehrig · December 2017

Hi Experts,

I’m experimenting with text mining and analysis. I’ve created a neighborhood co-occurrence from one text and am trying to analyse and compare it with a larger corpus.

My Example Set looks like this:

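| Row No. | Document | Word1 | Word2 | n |
|---|---|---|---|---|
| 1 | 1 | aaa | bbb | 2 |
| 2 | 1 | bbb | ddd | 3 |
| 3 | 1 | aaa | bbb | 4 |
| 4 | 2 | aaa | ccc | 3 |
| 5 | 2 | aaa | bbb | 4 |
| 6 | 2 | ccc | aaa | 3 |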

This is my process:

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads the example set stored at the repository entry ../data/17-12-21-Spon_10.
           Per the main wiring it is the input of "Process Documents from Data Spon". -->
      <operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve 17-12-21-Spon_10" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/17-12-21-Spon_10"/>
      </operator>
      <!--
        Turns the retrieved example set into documents and word vectors.
        Settings visible here: Term Frequency vectors, the text is kept (keep_text = true),
        meta information is not added, and select_attributes_and_weights is enabled
        with the single entry link = 1.0.
        Inner chain, in wiring order: Extract Content, Tokenize (linguistic tokens, German),
        Filter Tokens (condition "matches", regex [a-zA-Z]+), Transform Cases.
        NOTE(review): prune_method is "by ranking" while only the *_absolute thresholds are set;
        presumably those thresholds are not used in this mode. Confirm.
        NOTE(review): the filter regex contains ASCII letters only, so German tokens with
        umlauts or sharp s presumably get dropped. Confirm this is intended.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data Spon" width="90" x="179" y="34">
        <parameter key="vector_creation" value="Term Frequency"/>
        <parameter key="add_meta_information" value="false"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="by ranking"/>
        <parameter key="prune_below_absolute" value="10"/>
        <parameter key="prune_above_absolute" value="3000"/>
        <parameter key="data_management" value="memory-optimized"/>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="link" value="1.0"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
            <parameter key="minimum_text_block_length" value="2"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="179" y="34">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z" width="90" x="313" y="34">
            <parameter key="condition" value="matches"/>
            <parameter key="regular_expression" value="[a-zA-Z]+"/>
          </operator>
          <!-- No parameters are set here, so the operator runs with its defaults. -->
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
          <connect from_op="Tokenize Token" from_port="document" to_op="Filter Tokens a-zA-Z" to_port="document"/>
          <connect from_op="Filter Tokens a-zA-Z" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Subprocess "Splitting": prepares the text column for the bigram scripts.
        Steps, in wiring order: Select Attributes (subset "text", special attributes included),
        Generate ID, Rename id to Document, Set Role text = label, Rename text to word,
        Split word on the pattern \s+, Transpose ("Splitting Output").
        Outputs: out 1 = transposed split result, out 2 = the "original" port of Split.
        The main process consumes out 2 only.
        NOTE(review): out 2 presumably carries the un-split example set (Document, word). Confirm.
      -->
      <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="103" name="Splitting" width="90" x="313" y="34">
        <process expanded="true">
          <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="text"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="8.0.001" expanded="true" height="82" name="Generate ID" width="90" x="45" y="136"/>
          <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename ID" width="90" x="45" y="238">
            <parameter key="old_name" value="id"/>
            <parameter key="new_name" value="Document"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="45" y="340">
            <parameter key="attribute_name" value="text"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
            <parameter key="old_name" value="text"/>
            <parameter key="new_name" value="word"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="split" compatibility="8.0.001" expanded="true" height="82" name="Split" width="90" x="179" y="136">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="word"/>
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="split_pattern" value="\s+"/>
          </operator>
          <operator activated="true" class="transpose" compatibility="8.0.001" expanded="true" height="82" name="Splitting Output" width="90" x="313" y="34"/>
          <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Rename ID" to_port="example set input"/>
          <connect from_op="Rename ID" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="Splitting Output" to_port="example set input"/>
          <connect from_op="Split" from_port="original" to_port="out 2"/>
          <connect from_op="Splitting Output" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
        </process>
      </operator>
      <!--
        Subprocess "Neighborhood co-ocurrence": three chained Execute R scripts.
        1. Generate Bigrams: tidytext unnest_tokens(bigram, word, token = "ngrams", n = 2).
        2. Seperate Bigrams: tidyr separate() splits bigram into word1 and word2 at a single space;
           the stop word filter inside the script is commented out.
        3. Count Bigrams per Page: dplyr count(Document, word1, word2), converted to a data.frame.
        The scripts load the R packages dplyr, tidytext, tidyr and tokenizers.
        NOTE(review): scripts 1 and 2 return list(data.frame), script 3 returns the bare data.frame.
        Confirm that Execute R accepts both forms.
      -->
      <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Neighborhood co-ocurrence" width="90" x="447" y="34">
        <process expanded="true">
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Generate Bigrams" width="90" x="45" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#10;&#9;spon_bigrams &lt;- data %&gt;%&#10;&#9;  unnest_tokens(bigram, word, token = &quot;ngrams&quot;, n = 2)&#10;&#9;print(spon_bigrams)&#10;&#10;    return(list(spon_bigrams))    &#10;}&#10;"/>
          </operator>
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Seperate Bigrams" width="90" x="179" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#9;library(tokenizers)&#10;&#10;&#9;devided_bigrams &lt;-data %&gt;%&#10;&#9; separate(bigram, c(&quot;word1&quot;, &quot;word2&quot;), sep = &quot; &quot;)&#10;&#9; print(devided_bigrams)&#10;&#10;&#9;#bigrams_filtered &lt;- devided_bigrams %&gt;%&#10;&#9;# filter(!word1 %in% stopwords(&quot;de&quot;)) %&gt;%&#10;&#9;# filter(!word2 %in% stopwords(&quot;de&quot;))&#10;&#10;    return(list(devided_bigrams))&#10;    &#10;}&#10;"/>
          </operator>
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count Bigrams per Page" width="90" x="313" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#10;&#9;count_bigrams_per_page &lt;- data %&gt;%&#10;&#9;  count(Document, word1, word2)&#10;&#9;print(count_bigrams_per_page)&#10;&#10;&#9;counted_bigrams_per_page &lt;- data.frame(count_bigrams_per_page)&#10;&#10;    return(counted_bigrams_per_page)&#10;}&#10;"/>
          </operator>
          <connect from_port="in 1" to_op="Generate Bigrams" to_port="input 1"/>
          <connect from_op="Generate Bigrams" from_port="output 1" to_op="Seperate Bigrams" to_port="input 1"/>
          <connect from_op="Seperate Bigrams" from_port="output 1" to_op="Count Bigrams per Page" to_port="input 1"/>
          <connect from_op="Count Bigrams per Page" from_port="output 1" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Gives the attribute "Document" the role label on the counted bigram table of the first branch.
           NOTE(review): "Document" originates from Generate ID inside "Splitting", so the label is a document id. Confirm this is the intended learning target. -->
      <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (2)" width="90" x="581" y="34">
        <parameter key="attribute_name" value="Document"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Converts nominal attributes to numerical ones before the SVM; only an empty comparison_groups list is set, everything else is left at operator defaults.
           NOTE(review): the second branch uses its own "Nominal to Numerical (2)" and no preprocessing model is wired between the two, so the generated attributes presumably differ between training and application data. Confirm. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="715" y="34">
        <list key="comparison_groups"/>
      </operator>
      <!-- Trains a support vector machine on the first branch; no parameters are set, so operator defaults apply. Its model port feeds "Apply Model". -->
      <operator activated="true" class="support_vector_machine" compatibility="8.0.001" expanded="true" height="124" name="SVM" width="90" x="849" y="34"/>
      <!--
        Subprocess "Crawler Spon": builds the comparison corpus from the web.
        Crawl Web starts at http://www.spiegel.de, stores pages whose URL matches .+www.spiegel.+,
        follows links matching .+spiegel.+|.+de.+, with max_crawl_depth = 10 and max_pages = 10,
        retrieve_as_html and add_content_as_attribute both enabled.
        Get Pages is configured with link_attribute = Link and page_attribute = link.
        NOTE(review): the dots in both rules are unescaped and the alternative .+de.+ matches any URL
        with "de" between other characters, so the follow rule is very broad. Confirm intent.
        NOTE(review): this subprocess performs live HTTP requests, so results presumably vary between runs.
      -->
      <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="187">
        <process expanded="true">
          <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
            <parameter key="url" value="http://www.spiegel.de"/>
            <list key="crawling_rules">
              <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
              <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
            </list>
            <parameter key="max_crawl_depth" value="10"/>
            <parameter key="retrieve_as_html" value="true"/>
            <parameter key="add_content_as_attribute" value="true"/>
            <parameter key="max_pages" value="10"/>
            <parameter key="delay" value="100"/>
            <parameter key="max_concurrent_connections" value="200"/>
            <parameter key="max_connections_per_host" value="100"/>
            <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
          </operator>
          <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="link"/>
            <parameter key="random_user_agent" value="true"/>
          </operator>
          <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Document processing for the crawled corpus; parameters and inner chain mirror
        "Process Documents from Data Spon": Term Frequency vectors, keep_text = true,
        select_attributes_and_weights enabled with link = 1.0.
        Inner chain, in wiring order: Extract Content (2), Tokenize (linguistic tokens, German),
        Filter Tokens (condition "matches", regex [a-zA-Z]+), Transform Cases (2).
        NOTE(review): prune_method is "by ranking" while only the *_absolute thresholds are set;
        presumably those thresholds are not used in this mode. Confirm.
        NOTE(review): the filter regex contains ASCII letters only, so German tokens with
        umlauts or sharp s presumably get dropped. Confirm this is intended.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data Spon (2)" width="90" x="179" y="187">
        <parameter key="vector_creation" value="Term Frequency"/>
        <parameter key="add_meta_information" value="false"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="by ranking"/>
        <parameter key="prune_below_absolute" value="10"/>
        <parameter key="prune_above_absolute" value="3000"/>
        <parameter key="data_management" value="memory-optimized"/>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="link" value="1.0"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="45" y="34">
            <parameter key="minimum_text_block_length" value="2"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token (2)" width="90" x="179" y="34">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z (2)" width="90" x="313" y="34">
            <parameter key="condition" value="matches"/>
            <parameter key="regular_expression" value="[a-zA-Z]+"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_op="Tokenize Token (2)" to_port="document"/>
          <connect from_op="Tokenize Token (2)" from_port="document" to_op="Filter Tokens a-zA-Z (2)" to_port="document"/>
          <connect from_op="Filter Tokens a-zA-Z (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Subprocess "Splitting (2)": same steps as "Splitting", applied to the crawled corpus.
        Steps, in wiring order: Select Attributes (subset "text", special attributes included),
        Generate ID, Rename id to Document, Set Role text = label, Rename text to word,
        Split word on the pattern \s+, Transpose ("Splitting Output (2)").
        Outputs: out 1 = transposed split result, out 2 = the "original" port of Split (2).
        The main process consumes out 2 only.
      -->
      <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="103" name="Splitting (2)" width="90" x="313" y="187">
        <process expanded="true">
          <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="text"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="true" class="generate_id" compatibility="8.0.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="45" y="136"/>
          <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename ID (2)" width="90" x="45" y="238">
            <parameter key="old_name" value="id"/>
            <parameter key="new_name" value="Document"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (3)" width="90" x="45" y="340">
            <parameter key="attribute_name" value="text"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename (2)" width="90" x="179" y="34">
            <parameter key="old_name" value="text"/>
            <parameter key="new_name" value="word"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="split" compatibility="8.0.001" expanded="true" height="82" name="Split (2)" width="90" x="179" y="136">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="word"/>
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="split_pattern" value="\s+"/>
          </operator>
          <operator activated="true" class="transpose" compatibility="8.0.001" expanded="true" height="82" name="Splitting Output (2)" width="90" x="313" y="34"/>
          <connect from_port="in 1" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Generate ID (2)" to_port="example set input"/>
          <connect from_op="Generate ID (2)" from_port="example set output" to_op="Rename ID (2)" to_port="example set input"/>
          <connect from_op="Rename ID (2)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
          <connect from_op="Set Role (3)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Split (2)" to_port="example set input"/>
          <connect from_op="Split (2)" from_port="example set output" to_op="Splitting Output (2)" to_port="example set input"/>
          <connect from_op="Split (2)" from_port="original" to_port="out 2"/>
          <connect from_op="Splitting Output (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
        </process>
      </operator>
      <!--
        Subprocess "Neighborhood co-ocurrence (2)": the same three Execute R scripts as in
        "Neighborhood co-ocurrence", applied to the crawled corpus.
        1. Generate Bigrams (2): tidytext unnest_tokens(bigram, word, token = "ngrams", n = 2).
        2. Seperate Bigrams (2): tidyr separate() splits bigram into word1 and word2 at a single space;
           the stop word filter inside the script is commented out.
        3. Count Bigrams per Page (2): dplyr count(Document, word1, word2), converted to a data.frame.
        NOTE(review): scripts 1 and 2 return list(data.frame), script 3 returns the bare data.frame.
        Confirm that Execute R accepts both forms.
      -->
      <operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Neighborhood co-ocurrence (2)" width="90" x="447" y="187">
        <process expanded="true">
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Generate Bigrams (2)" width="90" x="45" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#10;&#9;spon_bigrams &lt;- data %&gt;%&#10;&#9;  unnest_tokens(bigram, word, token = &quot;ngrams&quot;, n = 2)&#10;&#9;print(spon_bigrams)&#10;&#10;    return(list(spon_bigrams))    &#10;}&#10;"/>
          </operator>
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Seperate Bigrams (2)" width="90" x="179" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#9;library(tokenizers)&#10;&#10;&#9;devided_bigrams &lt;-data %&gt;%&#10;&#9; separate(bigram, c(&quot;word1&quot;, &quot;word2&quot;), sep = &quot; &quot;)&#10;&#9; print(devided_bigrams)&#10;&#10;&#9;#bigrams_filtered &lt;- devided_bigrams %&gt;%&#10;&#9;# filter(!word1 %in% stopwords(&quot;de&quot;)) %&gt;%&#10;&#9;# filter(!word2 %in% stopwords(&quot;de&quot;))&#10;&#10;    return(list(devided_bigrams))&#10;    &#10;}&#10;"/>
          </operator>
          <operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count Bigrams per Page (2)" width="90" x="313" y="34">
            <parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#10;&#9;count_bigrams_per_page &lt;- data %&gt;%&#10;&#9;  count(Document, word1, word2)&#10;&#9;print(count_bigrams_per_page)&#10;&#10;&#9;counted_bigrams_per_page &lt;- data.frame(count_bigrams_per_page)&#10;&#10;    return(counted_bigrams_per_page)&#10;}&#10;"/>
          </operator>
          <connect from_port="in 1" to_op="Generate Bigrams (2)" to_port="input 1"/>
          <connect from_op="Generate Bigrams (2)" from_port="output 1" to_op="Seperate Bigrams (2)" to_port="input 1"/>
          <connect from_op="Seperate Bigrams (2)" from_port="output 1" to_op="Count Bigrams per Page (2)" to_port="input 1"/>
          <connect from_op="Count Bigrams per Page (2)" from_port="output 1" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Gives the attribute "Document" the role label on the counted bigram table of the crawled corpus, mirroring "Set Role (2)" in the first branch. -->
      <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (4)" width="90" x="581" y="187">
        <parameter key="attribute_name" value="Document"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Converts nominal attributes of the crawled corpus to numerical ones before "Apply Model"; only an empty comparison_groups list is set.
           NOTE(review): this conversion is independent of the one in the first branch, so its output attributes presumably do not line up with what the SVM was trained on. Confirm. -->
      <operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="715" y="187">
        <list key="comparison_groups"/>
      </operator>
      <!-- Applies the model coming from "SVM" to the output of "Nominal to Numerical (2)" (see the main wiring); the application_parameters list is empty. -->
      <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="983" y="136">
        <list key="application_parameters"/>
      </operator>
      <!--
        Main wiring. Two parallel branches run the same kind of preprocessing:
        first branch: Retrieve, Process Documents, Splitting (out 2), Neighborhood co-ocurrence,
        Set Role (2), Nominal to Numerical, then SVM (training set);
        second branch: Crawler Spon, Process Documents (2), Splitting (2) (out 2),
        Neighborhood co-ocurrence (2), Set Role (4), Nominal to Numerical (2), then Apply Model (unlabelled data).
        The SVM model port feeds Apply Model, whose labelled data is the only process result (result 1).
      -->
      <connect from_op="Retrieve 17-12-21-Spon_10" from_port="output" to_op="Process Documents from Data Spon" to_port="example set"/>
      <connect from_op="Process Documents from Data Spon" from_port="example set" to_op="Splitting" to_port="in 1"/>
      <connect from_op="Splitting" from_port="out 2" to_op="Neighborhood co-ocurrence" to_port="in 1"/>
      <connect from_op="Neighborhood co-ocurrence" from_port="out 1" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="SVM" to_port="training set"/>
      <connect from_op="SVM" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data Spon (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data Spon (2)" from_port="example set" to_op="Splitting (2)" to_port="in 1"/>
      <connect from_op="Splitting (2)" from_port="out 2" to_op="Neighborhood co-ocurrence (2)" to_port="in 1"/>
      <connect from_op="Neighborhood co-ocurrence (2)" from_port="out 1" to_op="Set Role (4)" to_port="example set input"/>
      <connect from_op="Set Role (4)" from_port="example set output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
      <connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

I’m out of ideas on how to compare and analyse them.

Does anyone have an idea how I can do this?

Regards

Tobias

MartinLiebig · December 2017

Hi @TobiasNehrig,

Are these texts or tuples you are working on? And does the order matter? I guess the solution is something like Pivot + Cross Distance or Aggregate + Cross Distance. But the precise solution depends on your use case.

Cheers,

Martin

TobiasNehrig · December 2017

Hi @mschmitz,

As I understand it, these should be tuples.

Regards

Tobias

MartinLiebig · December 2017

Ok,

I would concatenate the two words, Pivot, replace missing values with 0 and use Cross Distance.

Best,

Martin

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Compare and analyse text documents

Answers