The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.

"Extracting text from Youtube[SOLVED]"

geschwadergeschwader Member Posts: 16 Contributor II
edited June 2019 in Help
I am trying to use the "Get Pages" operator to extract some text from YouTube pages, but I get the following error:
image
Everything is fine with my Internet connection and I can watch YT with my browser.

Answers

  • Nils_WoehlerNils_Woehler Member Posts: 463 Maven
    Hi,

    can you please post an example process according to http://rapid-i.com/rapidforum/index.php/topic,4654.0.html ?
    If I just use the Get Pages operator everything works fine for me.

    Best,
    Nils
  • geschwadergeschwader Member Posts: 16 Contributor II
    Ok, sorry. Here it is:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process overview (see the connect elements at the end):
         Read CSV, Generate Extract, Get Pages, Generate Extract (2),
         Replace, Replace (2), Write CSV. -->
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="386" width="705">
          <!-- Reads the list of links; the file is decoded as windows-1251 and
               first_row_as_names is switched off. -->
          <operator activated="true" class="read_csv" compatibility="5.2.001" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Links.csv"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="encoding" value="windows-1251"/>
            <list key="data_set_meta_data_information"/>
          </operator>
          <!-- Uses a string matching query on the "Links" attribute to create the
               "Links2" attribute that Get Pages reads below. -->
          <operator activated="true" class="text:generate_extract" compatibility="5.2.001" expanded="true" height="60" name="Generate Extract" width="90" x="179" y="30">
            <parameter key="source_attribute" value="Links"/>
            <list key="string_machting_queries">
              <parameter key="Links2" value=".&amp;"/>
            </list>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- Takes the URLs from "Links2" and stores the result in the "Page" attribute. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.1.004" expanded="true" height="60" name="Get Pages" width="90" x="45" y="120">
            <parameter key="link_attribute" value="Links2"/>
            <parameter key="page_attribute" value="Page"/>
            <parameter key="user_agent" value="Opera"/>
            <parameter key="accept_cookies" value="all"/>
          </operator>
          <!-- String matching queries over "Page"; each parameter key below is the
               name of the attribute to create, each value the pattern to match. -->
          <operator activated="true" class="text:generate_extract" compatibility="5.2.001" expanded="true" height="60" name="Generate Extract (2)" width="90" x="45" y="210">
            <parameter key="source_attribute" value="Page"/>
            <list key="string_machting_queries">
              <parameter key="Тривалість" value="&quot;length_seconds&quot;: .,"/>
              <parameter key="Дата" value="&lt;span id=&quot;eow-date&quot; class=&quot;watch-video-date&quot; &gt;.&lt;/span&gt;"/>
              <parameter key="Опис" value="&lt;p id=&quot;eow-description&quot; &gt;.&lt;/p&gt;"/>
              <parameter key="Заголовок" value="&lt;span id=&quot;eow-title&quot; class=&quot;long-title&quot; dir=&quot;ltr&quot; title=&quot;.&quot;&gt;"/>
              <parameter key="Likes" value="&lt;span class=&quot;likes&quot;&gt;.&lt;/span&gt;"/>
              <parameter key="Dislikes" value="&lt;span class=&quot;dislikes&quot;&gt;.&lt;/span&gt;"/>
              <parameter key="Кількість коментарів" value="&lt;span class=&quot;comments-section-stat&quot;&gt;(.)&lt;/span&gt;"/>
              <parameter key="Кількість переглядів" value="&lt;span class=&quot;watch-view-count&quot;&gt;    &lt;strong&gt;.&lt;/strong&gt;"/>
            </list>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- NOTE(review): both Replace operators target an attribute named "Post",
               but no operator visible in this process creates an attribute with that
               name. Confirm the intended attribute. -->
          <operator activated="true" class="replace" compatibility="5.2.001" expanded="true" height="76" name="Replace" width="90" x="179" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Post"/>
            <parameter key="replace_what" value="&lt;(.*?)&gt;"/>
          </operator>
          <operator activated="true" class="replace" compatibility="5.2.001" expanded="true" height="76" name="Replace (2)" width="90" x="313" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Post"/>
            <parameter key="replace_what" value="&amp;(.*?);"/>
            <parameter key="replace_by" value=" "/>
          </operator>
          <operator activated="true" class="write_csv" compatibility="5.2.001" expanded="true" height="76" name="Write CSV" width="90" x="514" y="75">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Youtube_extract.csv"/>
          </operator>
          <connect from_op="Read CSV" from_port="output" to_op="Generate Extract" to_port="Example Set"/>
          <connect from_op="Generate Extract" from_port="Example Set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Generate Extract (2)" to_port="Example Set"/>
          <connect from_op="Generate Extract (2)" from_port="Example Set" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
          <connect from_op="Replace (2)" from_port="example set output" to_op="Write CSV" to_port="input"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
    </process>
    And the Links.csv file:
    http://usic.org.ua/upload/8f4ea2c9c93f56c4d624d97b63a81b8c4ebad2f3/Links.csv
    Thank you in advance for your help.
  • Nils_WoehlerNils_Woehler Member Posts: 463 Maven
    Hi,

    there seems to be a problem in your .csv file. The last entry has a white space at the beginning which is not allowed. If you remove the white space the process works fine.

    Best,
    Nils
  • scepxkoscepxko Member Posts: 15 Maven
    Hi,

    it seems you named attributes using an Eastern European/Cyrillic character set, which gives a very funky "Youtube_extract.csv" at the end :-)
    In the last line of your CSV there's the blank space mentioned by Nils before the "http://", which caused an error for me too. I removed the last empty line too.

    have a nice day!
    Alex

  • geschwadergeschwader Member Posts: 16 Contributor II
    Well, now it doesn't produce an error for the "Get Pages" operator, but it still doesn't extract the page content. I've simplified the process:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Simplified process (see the connect elements at the end):
         Read CSV, Get Pages, Write CSV. -->
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="386" width="705">
          <!-- The meta data entry below declares column 0 as an attribute named "Links". -->
          <operator activated="true" class="read_csv" compatibility="5.2.001" expanded="true" height="60" name="Read CSV" width="90" x="33" y="31">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Links2.csv"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="encoding" value="windows-1251"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Links.true.binominal.attribute"/>
            </list>
          </operator>
          <!-- NOTE(review): link_attribute is "Links2", but Read CSV above declares the
               column as "Links" and no operator in this process creates "Links2".
               follow_redirects is also switched off here. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.1.004" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
            <parameter key="link_attribute" value="Links2"/>
            <parameter key="page_attribute" value="Page"/>
            <parameter key="user_agent" value="Opera"/>
            <parameter key="follow_redirects" value="false"/>
            <parameter key="accept_cookies" value="all"/>
          </operator>
          <operator activated="true" class="write_csv" compatibility="5.2.001" expanded="true" height="76" name="Write CSV" width="90" x="313" y="30">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Youtube_extract.csv"/>
          </operator>
          <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Write CSV" to_port="input"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
    </process>
    Links2.csv file:
    http://usic.org.ua/upload/2abf7ee83587eab166bf9e956e38b95df362fea4/Links2.csv
    And here is what I get:
    http://usic.org.ua/upload/e5a607d60d4d88f4ed9a43e678ea03d2b8ba718a/Youtube_extract.csv
    No pages extracted.
  • scepxkoscepxko Member Posts: 15 Maven
    It seems the extraction operators are missing in your second version, which contains only 3 operators (against 7 in the first one posted before).
    Could you maybe explain what you're trying to do? :-)
  • geschwadergeschwader Member Posts: 16 Contributor II
    In the second example I just want RapidMiner to save those YT pages as HTML, i.e. to have a CSV file with URL and Page content attributes. Then I'll be able to perform various extract procedures on the Page attribute.
    But I simply don't get any pages extracted!
  • scepxkoscepxko Member Posts: 15 Maven
    Take a look at the code below and transform it for your own purposes
    It reads the "links_to_check.xls" which contains the raw URL list to save
    Then saves each page in HTML in a directory for further use.

    Maybe the coding isn't very elegant, but it works for what I needed.
    Some parts of the code were taken here and there.
    Alex
    PS : I just started to learn RapidMiner coding 2 days ago. Ask Nils for complicated questions :-)

    ______________________________________________________
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process overview (see the connect elements at the end):
         Read Excel, Nominal to Text, Process Documents from Data,
         Loop Examples (Delay, Extract Macro, Get Page, Write Document), Append. -->
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="611" width="949">
          <!-- The spreadsheet column is declared as a polynominal attribute named "text". -->
          <operator activated="true" class="read_excel" compatibility="5.2.001" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
            <parameter key="excel_file" value="E:\Rapidminer\reuters\links_to_check.xls"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="text.true.polynominal.attribute"/>
            </list>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="5.2.001" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="75">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- The inner process contains no operators: the document port is wired
               straight to the first document sink. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="75">
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true" height="753" width="1094">
              <connect from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Inner process: Delay, then Extract Macro stores the "text" value at
               index %{example} in the macro "website_url"; Get Page requests
               %{website_url} and Write Document saves the result as %{example}.html. -->
          <operator activated="true" class="loop_examples" compatibility="5.2.001" expanded="true" height="112" name="Loop Examples" width="90" x="447" y="75">
            <process expanded="true" height="753" width="681">
              <operator activated="true" class="delay" compatibility="5.2.001" expanded="true" height="76" name="Delay" width="90" x="112" y="30">
                <parameter key="delay" value="random"/>
                <parameter key="min_delay_amount" value="500"/>
              </operator>
              <operator activated="true" class="extract_macro" compatibility="5.2.001" expanded="true" height="60" name="Extract Macro" width="90" x="315" y="30">
                <parameter key="macro" value="website_url"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="text"/>
                <parameter key="example_index" value="%{example}"/>
              </operator>
              <!-- Get Page has no incoming connection; its URL comes only from the macro. -->
              <operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="165">
                <parameter key="url" value="%{website_url}"/>
                <parameter key="random_user_agent" value="true"/>
                <parameter key="accept_cookies" value="all"/>
                <list key="query_parameters"/>
              </operator>
              <operator activated="true" class="text:write_document" compatibility="5.2.001" expanded="true" height="60" name="Write Document" width="90" x="313" y="165">
                <parameter key="file" value="E:\Rapidminer\reuters\pages_to_process\%{example}.html"/>
              </operator>
              <connect from_port="example set" to_op="Delay" to_port="through 1"/>
              <connect from_op="Delay" from_port="through 1" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Extract Macro" from_port="example set" to_port="example set"/>
              <connect from_op="Get Page" from_port="output" to_op="Write Document" to_port="document"/>
              <connect from_op="Write Document" from_port="document" to_port="output 2"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="18"/>
              <portSpacing port="sink_output 1" spacing="18"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
            </process>
          </operator>
          <!-- NOTE(review): the outer process feeds Append from "output 1" of Loop
               Examples, while the inner process only connects the "example set" and
               "output 2" sinks. Confirm that Append receives what was intended. -->
          <operator activated="true" class="append" compatibility="5.2.001" expanded="true" height="76" name="Append" width="90" x="648" y="75"/>
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="output 1" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • geschwadergeschwader Member Posts: 16 Contributor II
    scepxko wrote:

    Take a look at the code below and transform it for your own purposes
    It reads the "links_to_check.xls" which contains the raw URL list to save
    Then saves each page in HTML in a directory for further use.

    Maybe the coding isn't very elegant, but it works for what I needed.
    Some parts of the code were taken here and there.
    Alex
    PS : I just started to learn RapidMiner coding 2 days ago. Ask Nils for complicated questions :-)

    ______________________________________________________
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process overview (see the connect elements at the end):
         Read Excel, Nominal to Text, Process Documents from Data,
         Loop Examples (Delay, Extract Macro, Get Page, Write Document), Append. -->
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="611" width="949">
          <!-- The spreadsheet column is declared as a polynominal attribute named "text". -->
          <operator activated="true" class="read_excel" compatibility="5.2.001" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
            <parameter key="excel_file" value="E:\Rapidminer\reuters\links_to_check.xls"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="text.true.polynominal.attribute"/>
            </list>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="5.2.001" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="75">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- The inner process contains no operators: the document port is wired
               straight to the first document sink. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="75">
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true" height="753" width="1094">
              <connect from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Inner process: Delay, then Extract Macro stores the "text" value at
               index %{example} in the macro "website_url"; Get Page requests
               %{website_url} and Write Document saves the result as %{example}.html. -->
          <operator activated="true" class="loop_examples" compatibility="5.2.001" expanded="true" height="112" name="Loop Examples" width="90" x="447" y="75">
            <process expanded="true" height="753" width="681">
              <operator activated="true" class="delay" compatibility="5.2.001" expanded="true" height="76" name="Delay" width="90" x="112" y="30">
                <parameter key="delay" value="random"/>
                <parameter key="min_delay_amount" value="500"/>
              </operator>
              <operator activated="true" class="extract_macro" compatibility="5.2.001" expanded="true" height="60" name="Extract Macro" width="90" x="315" y="30">
                <parameter key="macro" value="website_url"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="text"/>
                <parameter key="example_index" value="%{example}"/>
              </operator>
              <!-- Get Page has no incoming connection; its URL comes only from the macro. -->
              <operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="165">
                <parameter key="url" value="%{website_url}"/>
                <parameter key="random_user_agent" value="true"/>
                <parameter key="accept_cookies" value="all"/>
                <list key="query_parameters"/>
              </operator>
              <operator activated="true" class="text:write_document" compatibility="5.2.001" expanded="true" height="60" name="Write Document" width="90" x="313" y="165">
                <parameter key="file" value="E:\Rapidminer\reuters\pages_to_process\%{example}.html"/>
              </operator>
              <connect from_port="example set" to_op="Delay" to_port="through 1"/>
              <connect from_op="Delay" from_port="through 1" to_op="Extract Macro" to_port="example set"/>
              <connect from_op="Extract Macro" from_port="example set" to_port="example set"/>
              <connect from_op="Get Page" from_port="output" to_op="Write Document" to_port="document"/>
              <connect from_op="Write Document" from_port="document" to_port="output 2"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="18"/>
              <portSpacing port="sink_output 1" spacing="18"/>
              <portSpacing port="sink_output 2" spacing="0"/>
              <portSpacing port="sink_output 3" spacing="0"/>
            </process>
          </operator>
          <!-- NOTE(review): the outer process feeds Append from "output 1" of Loop
               Examples, while the inner process only connects the "example set" and
               "output 2" sinks. Confirm that Append receives what was intended. -->
          <operator activated="true" class="append" compatibility="5.2.001" expanded="true" height="76" name="Append" width="90" x="648" y="75"/>
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="output 1" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    So, where is GET operator in your process? How do you download those pages?
  • scepxkoscepxko Member Posts: 15 Maven
    it's in the Loop Examples operator, since you have to repeat the GET for each list item.
    You can try it, it works
  • geschwadergeschwader Member Posts: 16 Contributor II
    I'd prefer some comments on my last process  :) It's much simpler and doesn't require any intermediate stages with page saving. Why doesn't it work?
  • Nils_WoehlerNils_Woehler Member Posts: 463 Maven
    Hi,

    there is a problem with your process. The "Read CSV" result set has no attribute called "Links2" but an attribute called "Links". Change the "link attribute" parameter of "Get Pages" from Links2 to Links and it should work.
    Still, there should be an error instead of just showing an empty result set.

    *edit* With the next update an error will be thrown if the selected attribute does not exist.

    Best,
    Nils
  • geschwadergeschwader Member Posts: 16 Contributor II
    With this process I've managed to get what I want:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process overview (see the connect elements at the end):
         Read CSV, Get Pages, Generate Extract (2), Replace, Replace (2),
         Select Attributes, Write CSV. -->
    <process version="5.2.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.001" expanded="true" name="Process">
        <process expanded="true" height="431" width="710">
          <operator activated="true" class="read_csv" compatibility="5.2.001" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Links.csv"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="encoding" value="windows-1251"/>
            <list key="data_set_meta_data_information"/>
          </operator>
          <!-- Takes the URLs from the "Links" attribute and stores the result in "Page". -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="5.1.004" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
            <parameter key="link_attribute" value="Links"/>
            <parameter key="page_attribute" value="Page"/>
            <parameter key="user_agent" value="Opera"/>
            <parameter key="accept_cookies" value="all"/>
          </operator>
          <!-- String matching queries over "Page"; each parameter key below is the
               name of the attribute to create, each value the pattern to match. -->
          <operator activated="true" class="text:generate_extract" compatibility="5.2.001" expanded="true" height="60" name="Generate Extract (2)" width="90" x="45" y="210">
            <parameter key="source_attribute" value="Page"/>
            <list key="string_machting_queries">
              <parameter key="Тривалість" value="&quot;length_seconds&quot;: .,"/>
              <parameter key="Дата" value="&lt;span id=&quot;eow-date&quot; class=&quot;watch-video-date&quot; &gt;.&lt;/span&gt;"/>
              <parameter key="Опис" value="&lt;p id=&quot;eow-description&quot; &gt;.&lt;/p&gt;"/>
              <parameter key="Заголовок" value="&lt;span id=&quot;eow-title&quot; class=&quot;long-title&quot; dir=&quot;ltr&quot; title=&quot;.&quot;&gt;"/>
              <parameter key="Likes" value="&lt;span class=&quot;likes&quot;&gt;.&lt;/span&gt;"/>
              <parameter key="Dislikes" value="&lt;span class=&quot;dislikes&quot;&gt;.&lt;/span&gt;"/>
              <parameter key="Кількість коментарів" value="&lt;span class=&quot;comments-section-stat&quot;&gt;(.)&lt;/span&gt;"/>
              <parameter key="Кількість переглядів" value="span class=&quot;watch-view-count&quot;&gt;.&lt;/strong&gt;"/>
            </list>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- Both Replace operators apply the pattern &lt;(.*?)&gt; to one extracted
               attribute each; no replace_by parameter is set on either of them. -->
          <operator activated="true" class="replace" compatibility="5.2.001" expanded="true" height="76" name="Replace" width="90" x="179" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Опис"/>
            <parameter key="replace_what" value="&lt;(.*?)&gt;"/>
          </operator>
          <operator activated="true" class="replace" compatibility="5.2.001" expanded="true" height="76" name="Replace (2)" width="90" x="313" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Кількість переглядів"/>
            <parameter key="replace_what" value="&lt;(.*?)&gt;"/>
          </operator>
          <!-- The subset lists only the eight extracted attributes; "Page" and "Links"
               are not in it, so presumably they are not passed on to Write CSV. -->
          <operator activated="true" class="select_attributes" compatibility="5.2.001" expanded="true" height="76" name="Select Attributes" width="90" x="380" y="30">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="Dislikes|Likes|Дата|Заголовок|Кількість коментарів|Кількість переглядів|Опис|Тривалість|"/>
          </operator>
          <operator activated="true" class="write_csv" compatibility="5.2.001" expanded="true" height="76" name="Write CSV" width="90" x="514" y="75">
            <parameter key="csv_file" value="C:\Users\Олег\Desktop\КА НаУКМА\Медіація\Youtube_extract.csv"/>
          </operator>
          <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Generate Extract (2)" to_port="Example Set"/>
          <connect from_op="Generate Extract (2)" from_port="Example Set" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
          <connect from_op="Replace (2)" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Write CSV" to_port="input"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
        </process>
      </operator>
    </process>
    The problem, among others, was that the .csv column separator ";" was present in the "Page" attribute, so I couldn't see any meaningful result.
    Thank you for your support.
Sign In or Register to comment.