The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
Operator Crawl: Process failed
Hi,
I have installed the latest version, 10.1.001, and I have a problem with the Crawl operator.
The process fails, and here is the error message.
I have checked the version of Java and the version is 1.8.0_361
Thanks
I have installed the latest version, 10.1.001, and I have a problem with the Crawl operator.
The process fails, and here is the error message.
I have checked the version of Java and the version is 1.8.0_361
- Exception: java.lang.NoClassDefFoundError
- Message: org/apache/tika/parser/html/HtmlParser
- Stack trace:
- edu.uci.ics.crawler4j.parser.TikaHtmlParser.<init>(TikaHtmlParser.java:34)
- edu.uci.ics.crawler4j.parser.Parser.<init>(Parser.java:42)
- edu.uci.ics.crawler4j.crawler.CrawlController.<init>(CrawlController.java:85)
- com.rapidminer.operator.web.crawler.CrawlerOperator.doWork(CrawlerOperator.java:269)
- com.rapidminer.operator.Operator.execute(Operator.java:1024)
- com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
- com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:804)
- com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:799)
- java.base/java.security.AccessController.doPrivileged(Native Method)
- com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:799)
- com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:423)
- com.rapidminer.operator.Operator.execute(Operator.java:1024)
- com.rapidminer.Process.executeRoot(Process.java:1476)
- com.rapidminer.Process.lambda$executeRootInPool$5(Process.java:1452)
- com.rapidminer.studio.concurrency.internal.AbstractConcurrencyContext$AdaptedCallable.exec(AbstractConcurrencyContext.java:362)
- java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source)
- java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown Source)
- java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source)
- java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source)
- java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source)
Thanks
Tagged:
5
Answers
<context>
  <!-- Process context: the input, output and macros entries are all empty. -->
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="10.2.000" expanded="true" name="Process">
  <!-- Root operator: holds a single "Crawl Web" operator and the wiring of its output port. -->
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="UTF-8"/>
  <process expanded="true">
    <!-- Crawl Web operator (class web:crawl_web_modern) pointed at https://osa.fh-potsdam.de. -->
    <operator activated="true"
              class="web:crawl_web_modern"
              compatibility="10.0.000"
              expanded="true"
              height="68"
              name="Crawl Web"
              width="90"
              x="179"
              y="34">
      <parameter key="url" value="https://osa.fh-potsdam.de"/>
      <!-- The crawling_rules list is empty. -->
      <list key="crawling_rules"/>
      <parameter key="max_crawl_depth" value="1"/>
      <parameter key="retrieve_as_html" value="true"/>
      <parameter key="enable_basic_auth" value="false"/>
      <parameter key="add_content_as_attribute" value="false"/>
      <!-- write_pages_to_disk is enabled; output_dir is E:/_tmp with extension "html". -->
      <parameter key="write_pages_to_disk" value="true"/>
      <parameter key="include_binary_content" value="true"/>
      <parameter key="output_dir" value="E:/_tmp"/>
      <parameter key="output_file_extension" value="html"/>
      <parameter key="max_pages" value="10"/>
      <parameter key="max_page_size" value="1000"/>
      <parameter key="delay" value="200"/>
      <parameter key="max_concurrent_connections" value="100"/>
      <parameter key="max_connections_per_host" value="50"/>
      <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
      <parameter key="ignore_robot_exclusion" value="false"/>
    </operator>
    <!-- Connects the "example set" port of Crawl Web to the port named "result 1". -->
    <connect from_op="Crawl Web" from_port="example set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
</process>