The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
create wordlist for a list of URLs
davidellis
Member Posts: 4 Contributor I
in Help
I have a list of URLs that I want to scrape to create a word list. I can easily do this for all the URLs combined, but how do I load a whole bunch of URLs, get a word list for each one, and export them to Excel?
Here is the simple code to get the word list for all of them combined:
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: read a CSV of URLs, fetch each page, and build one
     combined TF-IDF word list (extract content -> tokenize -> lowercase ->
     filter English stopwords). The word list is delivered on result port 1. -->
<process version="9.9.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator name="Process" expanded="true" compatibility="9.9.002" class="process" activated="true">
    <parameter value="init" key="logverbosity"/>
    <parameter value="2001" key="random_seed"/>
    <parameter value="never" key="send_mail"/>
    <parameter value="" key="notification_email"/>
    <parameter value="30" key="process_duration_for_mail"/>
    <parameter value="SYSTEM" key="encoding"/>
    <process expanded="true">
      <!-- Reads the semicolon-separated CSV whose rows carry the URLs to visit. -->
      <operator name="Read CSV" expanded="true" compatibility="9.9.002" class="read_csv" activated="true" y="85" x="112" width="90" height="68">
        <parameter value="C:/Users/david/Desktop/rapidminer test.csv" key="csv_file"/>
        <parameter value=";" key="column_separators"/>
        <parameter value="false" key="trim_lines"/>
        <parameter value="true" key="use_quotes"/>
        <!-- The quote character is a literal double quote; it must be written
             as &quot; inside a double-quoted attribute value. -->
        <parameter value="&quot;" key="quotes_character"/>
        <parameter value="\" key="escape_character"/>
        <parameter value="false" key="skip_comments"/>
        <parameter value="#" key="comment_characters"/>
        <parameter value="1" key="starting_row"/>
        <parameter value="true" key="parse_numbers"/>
        <parameter value="." key="decimal_character"/>
        <parameter value="false" key="grouped_digits"/>
        <parameter value="," key="grouping_character"/>
        <parameter value="" key="infinity_representation"/>
        <parameter value="" key="date_format"/>
        <parameter value="true" key="first_row_as_names"/>
        <list key="annotations"/>
        <parameter value="SYSTEM" key="time_zone"/>
        <parameter value="English (United States)" key="locale"/>
        <parameter value="SYSTEM" key="encoding"/>
        <parameter value="false" key="read_all_values_as_polynominal"/>
        <list key="data_set_meta_data_information"/>
        <parameter value="true" key="read_not_matching_values_as_missings"/>
      </operator>
      <!-- Downloads each page; the URL is taken from the NEWURL attribute. -->
      <operator name="Get Pages" expanded="true" compatibility="9.3.001" class="web:retrieve_webpages" activated="true" y="85" x="246" width="90" height="68">
        <parameter value="NEWURL" key="link_attribute"/>
        <parameter value="false" key="random_user_agent"/>
        <parameter value="10000" key="connection_timeout"/>
        <parameter value="10000" key="read_timeout"/>
        <parameter value="true" key="follow_redirects"/>
        <parameter value="none" key="accept_cookies"/>
        <parameter value="global" key="cookie_scope"/>
        <parameter value="GET" key="request_method"/>
        <parameter value="none" key="delay"/>
        <parameter value="1000" key="delay_amount"/>
        <parameter value="0" key="min_delay_amount"/>
        <parameter value="1000" key="max_delay_amount"/>
      </operator>
      <!-- Turns the fetched HTML into a TF-IDF word vector / word list. -->
      <operator name="Process Documents from Data" expanded="true" compatibility="9.3.001" class="text:process_document_from_data" activated="true" y="85" x="447" width="90" height="82">
        <parameter value="true" key="create_word_vector"/>
        <parameter value="TF-IDF" key="vector_creation"/>
        <parameter value="true" key="add_meta_information"/>
        <parameter value="true" key="keep_text"/>
        <parameter value="absolute" key="prune_method"/>
        <parameter value="3.0" key="prune_below_percent"/>
        <parameter value="30.0" key="prune_above_percent"/>
        <parameter value="2" key="prune_below_absolute"/>
        <parameter value="100000000" key="prune_above_absolute"/>
        <parameter value="0.05" key="prune_below_rank"/>
        <parameter value="0.95" key="prune_above_rank"/>
        <parameter value="double_sparse_array" key="datamanagement"/>
        <parameter value="auto" key="data_management"/>
        <parameter value="false" key="select_attributes_and_weights"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Strips boilerplate HTML and keeps the main text content.
               NOTE: "neglegt_span_tags" is the operator's actual (misspelled)
               parameter key in RapidMiner — do not "correct" it. -->
          <operator name="Extract Content" expanded="true" compatibility="9.3.001" class="web:extract_html_text_content" activated="true" y="136" x="112" width="90" height="68">
            <parameter value="true" key="extract_content"/>
            <parameter value="5" key="minimum_text_block_length"/>
            <parameter value="true" key="override_content_type_information"/>
            <parameter value="true" key="neglegt_span_tags"/>
            <parameter value="true" key="neglect_p_tags"/>
            <parameter value="true" key="neglect_b_tags"/>
            <parameter value="true" key="neglect_i_tags"/>
            <parameter value="true" key="neglect_br_tags"/>
            <parameter value="true" key="ignore_non_html_tags"/>
          </operator>
          <operator name="Tokenize" expanded="true" compatibility="9.3.001" class="text:tokenize" activated="true" y="238" x="112" width="90" height="68">
            <parameter value="non letters" key="mode"/>
            <parameter value=".:" key="characters"/>
            <parameter value="English" key="language"/>
            <parameter value="3" key="max_token_length"/>
          </operator>
          <operator name="Transform Cases" expanded="true" compatibility="9.3.001" class="text:transform_cases" activated="true" y="238" x="246" width="90" height="68">
            <parameter value="lower case" key="transform_to"/>
          </operator>
          <operator name="Filter Stopwords (English)" expanded="true" compatibility="9.3.001" class="text:filter_stopwords_english" activated="true" y="238" x="380" width="90" height="68"/>
          <connect to_port="document" to_op="Extract Content" from_port="document"/>
          <connect to_port="document" to_op="Tokenize" from_port="document" from_op="Extract Content"/>
          <connect to_port="document" to_op="Transform Cases" from_port="document" from_op="Tokenize"/>
          <connect to_port="document" to_op="Filter Stopwords (English)" from_port="document" from_op="Transform Cases"/>
          <connect to_port="document 1" from_port="document" from_op="Filter Stopwords (English)"/>
          <portSpacing spacing="0" port="source_document"/>
          <portSpacing spacing="0" port="sink_document 1"/>
          <portSpacing spacing="0" port="sink_document 2"/>
        </process>
      </operator>
      <connect to_port="Example Set" to_op="Get Pages" from_port="output" from_op="Read CSV"/>
      <connect to_port="example set" to_op="Process Documents from Data" from_port="Example Set" from_op="Get Pages"/>
      <connect to_port="result 1" from_port="word list" from_op="Process Documents from Data"/>
      <portSpacing spacing="0" port="source_input 1"/>
      <portSpacing spacing="0" port="sink_result 1"/>
      <portSpacing spacing="0" port="sink_result 2"/>
    </process>
  </operator>
</process>
0
Answers
I fixed the process. Please check if this will help you.
You needed to define the name of the attribute that is going to store your html.
If you want to get the word list for each of the URLs, you'll need to wrap everything inside a Loop Values operator. That way you will get a list for each of the URLs you visit.
Then you can append everything and write it back to an Excel file.