Cut Document

CaptainChaos · September 2011

Hi Guys,

I think I have a fairly trivial problem, but I couldn't figure out how to solve it.

I am working with the Reuters dataset. I have a stemmed version consisting of one big document which contains all the other documents. So it is a big .txt file in which the beginning and end of each document is marked by the word "reuter". I tried to use the "Cut Document" operator to split them. As the query expression I used "reuters". The problem is that all documents now have the same name (label), which makes it hard to work with them.

Does anybody know how to give different names to all documents, like 1, 2, 3, 4, 5 for example, and then write/export them to Excel or a database?

Thanks in advance
Cheers

colo · September 2011

Hi,

here is a little example of how you could write the single documents as files:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Example process: build a sample document, cut it into segments with a regular expression, and hand every segment to Write Document so each one ends up in a file. -->
<process version="5.1.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
    <process expanded="true" height="607" width="758">
      <!-- Sample input: five "Document N content" lines, each enclosed by a leading and a trailing "marker" line (&#10; is a line feed). -->
      <operator activated="true" class="text:create_document" compatibility="5.1.001" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
        <parameter key="text" value="marker&#10;Document 1 content&#10;marker&#10;marker&#10;Document 2 content&#10;marker&#10;marker&#10;Document 3 content&#10;marker&#10;marker&#10;Document 4 content&#10;marker&#10;marker&#10;Document 5 content&#10;marker"/>
      </operator>
      <!-- Splits the input document; query_type selects the regular expression query below. -->
      <operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
        <parameter key="query_type" value="Regular Expression"/>
        <!-- NOTE(review): a string matching query is stored here as well; presumably it is ignored while query_type is "Regular Expression" (confirm). -->
        <list key="string_machting_queries">
          <parameter key="content" value="marker.marker"/>
        </list>
        <list key="regular_expression_queries">
          <!-- The capturing group (.*?) collects the text between two marker words; the surrounding \s* keeps whitespace (such as the line feeds) out of the captured content. Replace "marker" with the delimiter word of the real data. -->
          <parameter key="content" value="marker\s*(.*?)\s*marker"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
        <!-- Inner process: each segment is passed straight to the document output, with no further processing. -->
        <process expanded="true" height="607" width="758">
          <connect from_port="segment" to_port="document 1"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the documents produced by Cut Document and feeds them one at a time ("single" port) into Write Document. -->
      <operator activated="true" class="loop_collection" compatibility="5.1.011" expanded="true" height="76" name="Loop Collection" width="90" x="313" y="30">
        <process expanded="true" height="607" width="758">
          <operator activated="true" class="text:write_document" compatibility="5.1.001" expanded="true" height="60" name="Write Document" width="90" x="45" y="30">
            <!-- Absolute path from the author's machine; adjust before running. NOTE(review): %{a} looks like a macro meant to give every output file a distinct name; confirm what it expands to in this RapidMiner version. -->
            <parameter key="file" value="C:\Dokumente und Einstellungen\mraeder\Desktop\output\document_%{a}.txt"/>
          </operator>
          <connect from_port="single" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: Create Document, then Cut Document, then Loop Collection, then the process result. -->
      <connect from_op="Create Document" from_port="output" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

If you prefer a list-based output like Excel or database, this is the way to go:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Example process: cut a sample document into segments, turn the segments into an example set, add an id plus a generated "document" name per row, and write the result to an Excel file. -->
<process version="5.1.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
    <process expanded="true" height="607" width="758">
      <!-- Sample input: five "Document N content" lines, each enclosed by a leading and a trailing "marker" line (&#10; is a line feed). -->
      <operator activated="true" class="text:create_document" compatibility="5.1.001" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
        <parameter key="text" value="marker&#10;Document 1 content&#10;marker&#10;marker&#10;Document 2 content&#10;marker&#10;marker&#10;Document 3 content&#10;marker&#10;marker&#10;Document 4 content&#10;marker&#10;marker&#10;Document 5 content&#10;marker"/>
      </operator>
      <!-- Splits the input document; query_type selects the regular expression query below. -->
      <operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
        <parameter key="query_type" value="Regular Expression"/>
        <!-- NOTE(review): a string matching query is stored here as well; presumably it is ignored while query_type is "Regular Expression" (confirm). -->
        <list key="string_machting_queries">
          <parameter key="content" value="marker.marker"/>
        </list>
        <list key="regular_expression_queries">
          <!-- The capturing group (.*?) collects the text between two marker words; the surrounding \s* keeps whitespace (such as the line feeds) out of the captured content. Replace "marker" with the delimiter word of the real data. -->
          <parameter key="content" value="marker\s*(.*?)\s*marker"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
        <!-- Inner process: each segment is passed straight to the document output, with no further processing. -->
        <process expanded="true" height="607" width="758">
          <connect from_port="segment" to_port="document 1"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Converts the cut documents into an example set; the text attribute is set to "content" and meta information is switched off. -->
      <operator activated="true" class="text:documents_to_data" compatibility="5.1.001" expanded="true" height="76" name="Documents to Data" width="90" x="313" y="30">
        <parameter key="text_attribute" value="content"/>
        <parameter key="add_meta_information" value="false"/>
      </operator>
      <!-- Adds the id that the expression in Generate Attributes refers to. -->
      <operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="447" y="30"/>
      <operator activated="true" class="generate_attributes" compatibility="5.1.011" expanded="true" height="76" name="Generate Attributes" width="90" x="581" y="30">
        <list key="function_descriptions">
          <!-- New attribute "document": the literal prefix "document_" followed by the row id converted to a string. The quotes of the string literal are escaped as &quot; because they sit inside an attribute value. -->
          <parameter key="document" value="&quot;document_&quot;  + str(id)"/>
        </list>
      </operator>
      <operator activated="true" class="write_excel" compatibility="5.1.011" expanded="true" height="60" name="Write Excel" width="90" x="581" y="165">
        <!-- Absolute path from the author's machine; adjust before running. -->
        <parameter key="excel_file" value="C:\Dokumente und Einstellungen\mraeder\Desktop\output\documents.xls"/>
      </operator>
      <!-- Wiring: Create Document, Cut Document, Documents to Data, Generate ID, Generate Attributes, Write Excel, then the process result. -->
      <connect from_op="Create Document" from_port="output" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="126"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Hope these examples help you a little. Feel free to ask if you have further questions.

Regards
Matthias

CaptainChaos · September 2011

Hi Matthias,

first of all, thank you very much for your help; my model now works a lot better than before.
But I would like to ask you one more question. In the next snippet I copied your code and marked one line which is different from mine. Could you explain that line to me?

_{<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Quoted copy of the Excel example process; the line the poster marked is the regular expression query inside Cut Document. -->
<process version="5.1.011">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
<process expanded="true" height="607" width="758">
<operator activated="true" class="text:create_document" compatibility="5.1.001" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
<!-- Line breaks are written as &#10;: a literal line break inside an attribute value is turned into a space when the XML is parsed. -->
<parameter key="text" value="marker&#10;Document 1 content&#10;marker&#10;marker&#10;Document 2 content&#10;marker&#10;marker&#10;Document 3 content&#10;marker&#10;marker&#10;Document 4 content&#10;marker&#10;marker&#10;Document 5 content&#10;marker"/>
</operator>
<operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
<parameter key="query_type" value="Regular Expression"/>
<list key="string_machting_queries">
<parameter key="content" value="marker.marker"/>
</list>
<list key="regular_expression_queries">
<parameter key="content" value="marker\s*(.*?)\s*marker"/> \s*(.*?)\s* --> Plural
</list>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<process expanded="true" height="607" width="758">
<connect from_port="segment" to_port="document 1"/>
<portSpacing port="source_segment" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:documents_to_data" compatibility="5.1.001" expanded="true" height="76" name="Documents to Data" width="90" x="313" y="30">
<parameter key="text_attribute" value="content"/>
<parameter key="add_meta_information" value="false"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="447" y="30"/>
<operator activated="true" class="generate_attributes" compatibility="5.1.011" expanded="true" height="76" name="Generate Attributes" width="90" x="581" y="30">
<list key="function_descriptions">
<!-- The quotes of the string literal must be escaped as &quot; inside the double-quoted attribute value; otherwise the element is not well-formed. -->
<parameter key="document" value="&quot;document_&quot; + str(id)"/>
</list>
</operator>
<operator activated="true" class="write_excel" compatibility="5.1.011" expanded="true" height="60" name="Write Excel" width="90" x="581" y="165">
<parameter key="excel_file" value="C:\Dokumente und Einstellungen\mraeder\Desktop\output\documents.xls"/>
</operator>
<connect from_op="Create Document" from_port="output" to_op="Cut Document" to_port="document"/>
<connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
<connect from_op="Documents to Data" from_port="example set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="126"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>}

CaptainChaos · September 2011

One more question: how can I use an extra window for my code like you did? Thanks a lot.

Kind regards Roberto

colo · September 2011

Hi Roberto,

you can use the code style by adding CODE-tags around it. It's the third symbol from the right just above the smileys.

The highlighted line is my split expression. You said there is a word marking the beginning and the end of each document. Since I manually typed some example document contents, I simply used "marker" for this. I think it should be "reuters" in your case. The regular expression used to cut the text collects anything between two marker words (the first capturing group) and also uses \s* to cut off whitespace surrounding the content (a newline between the marker word and the beginning of the actual document content, for example).

Hope this clarifies things.

Regards
Matthias

CaptainChaos · September 2011

Yes, it did, thanks a lot. By the way, do you know how I could tell RapidMiner in the next step to treat each row in the Excel sheet as a separate document, so that I could do some data-to-similarity or clustering? Thanks in advance for your time.

Flake · September 2011

@Roberto, text in each row is perfect for further processing, e.g. clustering. You may refer to this. The Excel file I read contains two columns (id, text), and each row holds the text of one document.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Example process: read an example set from Excel, run a text preprocessing chain on it via Process Documents from Data, and cluster the result with k-means. -->
<process version="5.1.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
    <process expanded="true" height="673" width="1299">
      <!-- NOTE(review): no Excel file parameter is stored in this snippet; presumably the path must be set on the operator before running (confirm). -->
      <operator activated="true" class="read_excel" compatibility="5.1.011" expanded="true" height="60" name="Read Excel" width="90" x="149" y="177">
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
      </operator>
      <!-- Takes the example set from Read Excel and applies the inner preprocessing chain; keep_text is switched on. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.1.002" expanded="true" height="76" name="ProcessDocs Train" width="90" x="514" y="75">
        <parameter key="keep_text" value="true"/>
        <list key="specify_weights"/>
        <process expanded="true" height="655" width="1275">
          <!-- Chain order is given by the connect elements below: Transform Cases, Replace Tokens, Tokenize, Filter Stopwords (English), Stem (Snowball), Filter Tokens (by Length). -->
          <operator activated="true" class="text:transform_cases" compatibility="5.1.002" expanded="true" height="60" name="Transform Cases" width="90" x="204" y="275"/>
          <operator activated="true" class="text:replace_tokens" compatibility="5.1.002" expanded="true" height="60" name="Replace Tokens" width="90" x="259" y="152">
            <!-- Replacement dictionary: "e-mail" becomes "email" and "i'm" becomes "i am". -->
            <list key="replace_dictionary">
              <parameter key="e-mail" value="email"/>
              <parameter key="i'm" value="i am"/>
            </list>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="5.1.002" expanded="true" height="60" name="Tokenize" width="90" x="315" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="450" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.1.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="585" y="30"/>
          <!-- Length filter configured with min_chars 2 and max_chars 999. -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.1.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="720" y="30">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="999"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Replace Tokens" to_port="document"/>
          <connect from_op="Replace Tokens" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- k-means clustering with k set to 6 and max_runs set to 12; tune both for the real data. -->
      <operator activated="true" class="k_means" compatibility="5.1.011" expanded="true" height="76" name="Clustering" width="90" x="849" y="75">
        <parameter key="k" value="6"/>
        <parameter key="max_runs" value="12"/>
      </operator>
      <!-- Wiring: Read Excel, ProcessDocs Train, Clustering; the clustered set goes to result 1 and the cluster model to result 2. -->
      <connect from_op="Read Excel" from_port="output" to_op="ProcessDocs Train" to_port="example set"/>
      <connect from_op="ProcessDocs Train" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
    </process>
  </operator>
</process>

Howdy, Stranger!

Quick Links

Categories

Altair RapidMiner Community

GET HELP. LEARN BEST PRACTICES. NETWORK WITH YOUR PEERS.

Cut Document

Answers