Downloading images for OCR
Hi
I am attempting to apply machine learning to determine the colours that individuals prefer. As part of this I need to download brand images. I am attempting to store a library of images; here is one of them as an example:
https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg
I am using the web crawling extension and trying to save the image on the page:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Canvas snippet: one "Generate Data by User Specification" operator that
  defines a single attribute named "image" whose value expression is a quoted
  URL string.
  The double quotes inside the value expression are written as &quot; because
  a literal double quote would terminate the XML attribute value.
-->
<process version="8.1.000">
  <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="112" y="85">
    <list key="attribute_values">
      <parameter key="image" value="(&quot;https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg&quot;)"/>
    </list>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Canvas snippet: a single Multiply operator; it has no parameters and no connections here. -->
<process version="8.1.000">
  <operator activated="true"
            class="multiply"
            compatibility="8.1.000"
            expanded="true"
            height="103"
            name="Multiply"
            width="90"
            x="246"
            y="85"/>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Canvas snippet: an Extract Macro operator configured to define a macro named
  "image" from a data value: attribute "image", example index 1.
  NOTE(review): "statistics" is set to "average" although macro_type is
  "data_value"; presumably it is ignored in this mode. Confirm against the
  operator reference.
-->
<process version="8.1.000">
  <operator activated="true"
            class="extract_macro"
            compatibility="8.1.000"
            expanded="true"
            height="68"
            name="Extract Macro"
            width="90"
            x="380"
            y="85">
    <parameter key="macro" value="image"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="statistics" value="average"/>
    <parameter key="attribute_name" value="image"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Canvas snippet: a Loop Examples operator whose inner process runs a single
  Crawl Web operator and passes its example set to the loop's "output 1" port.

  NOTE(review): the loop declares the iteration macro "example", but nothing
  inside the loop references it. The crawl URL always comes from the macro
  "image", so every iteration would request the same URL unless "image" is
  re-extracted per example inside the loop.
-->
<process version="8.1.000">
  <operator activated="true" class="loop_examples" compatibility="8.1.000" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="238">
    <parameter key="iteration_macro" value="example"/>
    <process expanded="true">
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="313" y="289">
        <!-- The start URL is the image URL held in the "image" macro. -->
        <parameter key="url" value="%{image}"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*"/>
        </list>
        <parameter key="max_crawl_depth" value="1"/>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="enable_basic_auth" value="false"/>
        <parameter key="add_content_as_attribute" value="true"/>
        <parameter key="write_pages_to_disk" value="true"/>
        <!--
          NOTE(review): the target is an image, yet binary content is excluded
          and pages are retrieved as HTML. Presumably this needs to be "true"
          for image downloads; confirm against the Crawl Web documentation.
        -->
        <parameter key="include_binary_content" value="false"/>
        <parameter key="output_dir" value="/Users/robinmeisel/Desktop/images"/>
        <!--
          Fixed: this was "%{image}.png". The "image" macro holds the full URL
          (it is also used as the crawl URL above), so the extension contained
          ":" and "/" characters and could not form a valid file name.
          NOTE(review): the example image is a .jpg; choose the extension that
          matches the files actually downloaded.
        -->
        <parameter key="output_file_extension" value="png"/>
        <parameter key="max_pages" value="1"/>
        <!-- NOTE(review): confirm the unit of max_page_size; 1000 may be too small for an image. -->
        <parameter key="max_page_size" value="1000"/>
        <parameter key="delay" value="200"/>
        <parameter key="max_concurrent_connections" value="100"/>
        <parameter key="max_connections_per_host" value="50"/>
        <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
      </operator>
      <connect from_op="Crawl Web" from_port="example set" to_port="output 1"/>
      <portSpacing port="source_example set" spacing="0"/>
      <portSpacing port="sink_example set" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>
It worked once and then never again. In the process above there is only one image in the Generate Data operator; normally this would reference a database of over a thousand images to download.
What is the best approach to download images from a web page and store them in a local folder before processing them with OCR?
Kind regards
Robin
Best Answer
-
sgenzer Administrator, Moderator, Employee-RapidMiner, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
Hi @robin ok thanks for that. I now understand. This is rather "quick and dirty" but I hope you may get the idea of how I would approach it. There may be a more clever way.... ?
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Process outline (as wired by the connect elements below):
    1. Crawl Web starts at the community site and outputs an example set.
    2. Loop Examples: for each example, the "Link" attribute value is extracted
       into the macro "URL", the page is fetched (Get Page), converted to data,
       split on angle brackets, transposed, and filtered to rows of att_1 that
       contain ".png" and "https". Two Generate Attributes steps then trim att_1
       to the span from "http" up to and including ".png".
    3. Loop Examples (2): for each remaining row, att_1 is extracted into the
       macro "imageURL"; inside Handle Exception the URL is opened (Open File)
       and written to disk (Write File) under the name held in "imageName".

  Angle brackets and double quotes that occur inside attribute values are
  escaped (&lt; &gt; &quot;) so that the document is well-formed XML.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
        <parameter key="url" value="https://community.rapidminer.com/"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".*"/>
        </list>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="output_dir" value="/Users/GenzerConsulting"/>
        <parameter key="max_pages" value="5"/>
        <parameter key="max_page_size" value="1000000000"/>
      </operator>
      <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
        <process expanded="true">
          <!-- Read the "Link" value of the current example into the macro "URL". -->
          <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="45" y="34">
            <parameter key="macro" value="URL"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="Link"/>
            <parameter key="example_index" value="%{example}"/>
            <list key="additional_macros"/>
          </operator>
          <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="179" y="34">
            <parameter key="url" value="%{URL}"/>
            <parameter key="accept_cookies" value="all"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="false"/>
          </operator>
          <!-- Split pattern is the regex [<]|[>], escaped for use inside an attribute value. -->
          <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="447" y="34">
            <parameter key="split_pattern" value="[&lt;]|[&gt;]"/>
          </operator>
          <operator activated="true" class="transpose" compatibility="8.2.000" expanded="true" height="82" name="Transpose" width="90" x="581" y="34"/>
          <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="715" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="att_1.contains.\.png"/>
              <parameter key="filters_entry_key" value="att_1.contains.https"/>
            </list>
          </operator>
          <!-- Drop everything in att_1 that precedes the first "http". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="849" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="suffix(att_1,length(att_1)-index(att_1,&quot;http&quot;))"/>
            </list>
          </operator>
          <!-- The operators below with activated="false" are disabled and do not run. -->
          <operator activated="false" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="391">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="[&lt;]|[&gt;]"/>
          </operator>
          <operator activated="false" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="179" y="391">
            <parameter key="string" value=".png"/>
          </operator>
          <operator activated="false" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="391">
            <parameter key="max_chars" value="250"/>
          </operator>
          <operator activated="false" breakpoints="after" class="text:write_document" compatibility="7.5.000" expanded="true" height="82" name="Write Document" width="90" x="447" y="391">
            <parameter key="file" value="/Users/GenzerConsulting/foo.txt"/>
          </operator>
          <operator activated="false" class="text:keep_document_parts" compatibility="7.5.000" expanded="true" height="68" name="Keep Document Parts" width="90" x="112" y="187">
            <parameter key="extraction_regex" value="http.*[.]png.*\s"/>
          </operator>
          <operator activated="false" breakpoints="after" class="web:unescape_html" compatibility="7.3.000" expanded="true" height="68" name="Unescape HTML Document" width="90" x="246" y="391"/>
          <operator activated="false" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML" width="90" x="514" y="187"/>
          <!-- Cut att_1 off right after the first ".png" (index of ".png" plus its 4 characters). -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="983" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="prefix(att_1,4+index(att_1,&quot;.png&quot;))"/>
            </list>
          </operator>
          <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="82" name="Loop Examples (2)" width="90" x="1117" y="34">
            <parameter key="iteration_macro" value="example2"/>
            <process expanded="true">
              <!-- Read att_1 of the current example into the macro "imageURL". -->
              <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (3)" width="90" x="45" y="34">
                <parameter key="macro" value="imageURL"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="att_1"/>
                <parameter key="example_index" value="%{example2}"/>
                <list key="additional_macros"/>
              </operator>
              <!--
                The first subprocess downloads the image; the second subprocess
                is empty. NOTE(review): presumably this lets the loop continue
                when a single download fails; confirm against the Handle
                Exception documentation.
              -->
              <operator activated="true" class="handle_exception" compatibility="8.2.000" expanded="true" height="82" name="Handle Exception" width="90" x="179" y="34">
                <process expanded="true">
                  <!-- imageName: the last 15 characters of the URL with "/" removed. -->
                  <operator activated="true" class="generate_macro" compatibility="8.2.000" expanded="true" height="68" name="Generate Macro" width="90" x="112" y="34">
                    <list key="function_descriptions">
                      <parameter key="imageName" value="replace(suffix(%{imageURL},15),&quot;/&quot;,&quot;&quot;)"/>
                    </list>
                  </operator>
                  <operator activated="true" class="open_file" compatibility="8.2.000" expanded="true" height="68" name="Open File" width="90" x="246" y="34">
                    <parameter key="resource_type" value="URL"/>
                    <parameter key="url" value="%{imageURL}"/>
                  </operator>
                  <operator activated="true" class="write_file" compatibility="8.2.000" expanded="true" height="68" name="Write File" width="90" x="380" y="34">
                    <parameter key="filename" value="/Users/GenzerConsulting/%{imageName}"/>
                  </operator>
                  <connect from_op="Open File" from_port="file" to_op="Write File" to_port="file"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
                <process expanded="true">
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>
          <connect from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Get Page" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
          <connect from_op="Transpose" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
          <connect from_op="Loop Examples (2)" from_port="example set" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>
Scott
1
Answers
Hallo Robin,
I tried to open your XML file in RM Studio 8.2.000, but it does not populate the process window.
Maerkli
@robin yes I think (?) you did "select all" and then copy/paste direct from the design canvas. This unfortunately produces broken XML. Can you please either attach your .rmp file or go to the XML panel and then copy/paste from there?
Scott
Tried pasting inside of Chrome.
Temporary link to file
Put a temporary link to the file.