The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is in read-only mode from October 28 to November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here.
"Problem with FeatureExtraction (VISTA, Java 1.6.16)"
Could somebody explain to me how to use FeatureExtraction? It does not work for me...
Here is my code:
<!-- FeatureExtraction step as posted; per the surrounding text it is meant to extract data from HTML pages via XPath. -->
<operator name="FeatureExtraction" class="FeatureExtraction" breakpoints="within,after">
  <list key="texts">
    <parameter key="tmp_dir" value="c:\tmp\tmp%{file_name}"/>
  </list>
  <parameter key="default_content_type" value="text/html"/>
  <parameter key="default_content_encoding" value="UTF-8"/>
  <parameter key="default_content_language" value="pl"/>
  <parameter key="use_content_attributes" value="true"/>
  <parameter key="id_attribute_type" value="number"/>
  <!-- Presumably each entry maps a result attribute name (key) to the expression that fills it (value). -->
  <list key="attributes">
    <parameter key="cala_strona" value="//h:div/h:div/text()"/>
    <parameter key="prezentacja" value="//h:div/text()"/>
    <!-- Embedded double quotes are written as &quot; so the attribute value stays well-formed. -->
    <parameter key="all" value="&quot;.*&quot;"/>
    <parameter key="root_www" value="/h:html/text()"/>
    <!-- NOTE(review): [class=...] tests a child element named "class"; [@class=...] may have been intended. Confirm. -->
    <parameter key="result_text" value="//h:div[class=&quot;result_txt&quot;]"/>
    <!-- NOTE(review): one step uses the prefix "hi:" (hi:li) while every other step uses "h:". Possibly a typo; confirm. -->
    <parameter key="result" value="/h:html/h:body/h:div/h:div[2]/h:div[3]/h:div/h:div[3]/h:div/h:ul/hi:li[2]/h:div[2]/text()"/>
  </list>
  <!-- NOTE(review): this list is empty although the expressions above use the h: prefix. Confirm whether h must be bound here. -->
  <list key="namespaces">
  </list>
</operator>
I am using it in my plan, which extracts data for analysis from an HTML page using XPath:
Here is my code:
<!-- FeatureExtraction step as posted; per the surrounding text it is meant to extract data from HTML pages via XPath. -->
<operator name="FeatureExtraction" class="FeatureExtraction" breakpoints="within,after">
  <list key="texts">
    <parameter key="tmp_dir" value="c:\tmp\tmp%{file_name}"/>
  </list>
  <parameter key="default_content_type" value="text/html"/>
  <parameter key="default_content_encoding" value="UTF-8"/>
  <parameter key="default_content_language" value="pl"/>
  <parameter key="use_content_attributes" value="true"/>
  <parameter key="id_attribute_type" value="number"/>
  <!-- Presumably each entry maps a result attribute name (key) to the expression that fills it (value). -->
  <list key="attributes">
    <parameter key="cala_strona" value="//h:div/h:div/text()"/>
    <parameter key="prezentacja" value="//h:div/text()"/>
    <!-- Embedded double quotes are written as &quot; so the attribute value stays well-formed. -->
    <parameter key="all" value="&quot;.*&quot;"/>
    <parameter key="root_www" value="/h:html/text()"/>
    <!-- NOTE(review): [class=...] tests a child element named "class"; [@class=...] may have been intended. Confirm. -->
    <parameter key="result_text" value="//h:div[class=&quot;result_txt&quot;]"/>
    <!-- NOTE(review): one step uses the prefix "hi:" (hi:li) while every other step uses "h:". Possibly a typo; confirm. -->
    <parameter key="result" value="/h:html/h:body/h:div/h:div[2]/h:div[3]/h:div/h:div[3]/h:div/h:ul/hi:li[2]/h:div[2]/text()"/>
  </list>
  <!-- NOTE(review): this list is empty although the expressions above use the h: prefix. Confirm whether h must be bound here. -->
  <list key="namespaces">
  </list>
</operator>
I am using it in my plan, which extracts data for analysis from an HTML page using XPath:
Tagged:
0
Answers
<?xml version="1.0" encoding="windows-1252"?>
<!--
  Complete process definition (process version 4.5) as posted in the thread.
  Markup characters inside attribute values (<, >, ") are written as
  &lt;, &gt; and &quot; so that the document is well-formed XML.
-->
<process version="4.5">
  <operator name="Root" class="Process" expanded="yes">
    <parameter key="logverbosity" value="status"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <!-- Outer iterator: configured for sub-directories of C:\input\htmls\ (iterate_over_subdirs=true, iterate_over_files=false). -->
    <operator name="DirsIter" class="FileIterator" expanded="yes">
      <parameter key="directory" value="C:\input\htmls\"/>
      <parameter key="file_name_macro" value="file_name"/>
      <parameter key="file_path_macro" value="file_path"/>
      <parameter key="parent_path_macro" value="parent_path"/>
      <parameter key="recursive" value="false"/>
      <parameter key="iterate_over_files" value="false"/>
      <parameter key="iterate_over_subdirs" value="true"/>
      <!-- Inner iterator: configured for files (iterate_over_files=true) in the directory given by the file_path macro. -->
      <!-- NOTE(review): it reuses the same macro names (file_name, file_path, parent_path) as the outer iterator. Confirm this shadowing is intended. -->
      <operator name="FileIterator" class="FileIterator" expanded="yes">
        <parameter key="directory" value="%{file_path}"/>
        <parameter key="file_name_macro" value="file_name"/>
        <parameter key="file_path_macro" value="file_path"/>
        <parameter key="parent_path_macro" value="parent_path"/>
        <parameter key="recursive" value="false"/>
        <parameter key="iterate_over_files" value="true"/>
        <parameter key="iterate_over_subdirs" value="false"/>
        <operator name="Cleaning" class="OperatorChain" expanded="yes">
          <operator name="TextObjectLoader" class="TextObjectLoader">
            <parameter key="file" value="%{file_path}"/>
          </operator>
          <!-- Segment boundaries: from the DOCTYPE declaration to the closing html tag. -->
          <operator name="TextSegmenter" class="TextSegmenter" expanded="yes">
            <parameter key="start_regex" value="&lt;!DOCTYPE html PUBLIC [^&gt;]*&gt;"/>
            <parameter key="end_regex" value="&lt;/html&gt;"/>
            <!-- Deletion pattern matches the original DOCTYPE declaration. -->
            <operator name="TextCleaner" class="TextCleaner">
              <parameter key="deletion_regex" value="&lt;!DOCTYPE html PUBLIC [^&gt;]*&gt;"/>
            </operator>
            <operator name="TextObject2ExampleSet" class="TextObject2ExampleSet">
              <parameter key="keep_text_object" value="true"/>
              <parameter key="text_attribute" value="my_doc_text"/>
              <parameter key="label_attribute" value="my_doc_label"/>
            </operator>
            <operator name="ValueIterator" class="ValueIterator" expanded="yes">
              <parameter key="attribute" value="my_doc_text"/>
              <parameter key="iteration_macro" value="loop_value"/>
              <!-- Text is the loop_value macro prefixed with a DOCTYPE that points at a local copy of the XHTML 1.0 Transitional DTD. -->
              <operator name="SingleTextObjectInput" class="SingleTextObjectInput">
                <parameter key="text" value="&lt;!DOCTYPE html PUBLIC &quot;-//W3C//DTD XHTML 1.0 Transitional//EN&quot; &quot;file:///C:/workspace-rapidminer/xhtml1-transitional.dtd&quot; &gt;%{loop_value}"/>
              </operator>
            </operator>
            <!-- Target path lies under c:\tmp\tmp%{file_name}, the same directory the FeatureExtraction "texts" list refers to. -->
            <operator name="TextObjectWriter" class="TextObjectWriter">
              <parameter key="file" value="c:\tmp\tmp%{file_name}\%{file_name}"/>
              <parameter key="overwrite" value="true"/>
            </operator>
            <operator name="IOConsumer ExampleSet" class="IOConsumer">
              <parameter key="io_object" value="ExampleSet"/>
              <parameter key="deletion_type" value="delete_all"/>
              <parameter key="delete_which" value="1"/>
              <parameter key="except" value="1"/>
            </operator>
            <operator name="IOConsumer TextObject" class="IOConsumer">
              <parameter key="io_object" value="TextObject"/>
              <parameter key="deletion_type" value="delete_all"/>
              <parameter key="delete_which" value="1"/>
              <parameter key="except" value="1"/>
            </operator>
            <operator name="FeatureExtraction" class="FeatureExtraction" breakpoints="within,after">
              <list key="texts">
                <parameter key="tmp_dir" value="c:\tmp\tmp%{file_name}"/>
              </list>
              <parameter key="default_content_type" value="text/html"/>
              <parameter key="default_content_encoding" value="UTF-8"/>
              <parameter key="default_content_language" value="pl"/>
              <parameter key="use_content_attributes" value="true"/>
              <parameter key="id_attribute_type" value="number"/>
              <!-- Presumably each entry maps a result attribute name (key) to the expression that fills it (value). -->
              <list key="attributes">
                <parameter key="a_data" value="//h:div/h:div/text()"/>
                <parameter key="text_in_divs" value="//h:div/text()"/>
                <parameter key="all" value="&quot;.*&quot;"/>
                <parameter key="root_www" value="/h:html/text()"/>
                <!-- NOTE(review): [class=...] tests a child element named "class"; [@class=...] may have been intended. Confirm. -->
                <parameter key="result_text" value="//h:div[class=&quot;result_txt&quot;]"/>
                <!-- NOTE(review): one step uses the prefix "hi:" (hi:li) while every other step uses "h:". Possibly a typo; confirm. -->
                <parameter key="result" value="/h:html/h:body/h:div/h:div[2]/h:div[3]/h:div/h:div[3]/h:div/h:ul/hi:li[2]/h:div[2]/text()"/>
              </list>
              <!-- NOTE(review): this list is empty although the expressions above use the h: prefix. Confirm whether h must be bound here. -->
              <list key="namespaces">
              </list>
            </operator>
            <operator name="IOObjectWriter" class="IOObjectWriter">
              <parameter key="object_file" value="c:\tmp\result%{file_name}\%{file_name}_res"/>
              <parameter key="io_object" value="ExampleSet"/>
              <parameter key="write_which" value="1"/>
              <parameter key="output_type" value="XML"/>
              <parameter key="continue_on_error" value="true"/>
            </operator>
          </operator>
        </operator>
        <!-- Empty chain: no operators are configured here. -->
        <operator name="Extraction" class="OperatorChain" expanded="yes">
        </operator>
      </operator>
    </operator>
  </operator>
</process>
What's the error message? Perhaps your XPath expression is wrong? Did you take a look at the Operator description?
Greetings,
Sebastian
<!-- Abridged fragment ("..." marks omitted content): the "namespaces" list binds the prefix x to the XHTML namespace URI. -->
<FeatureExtractor>
  ...
  <list key="namespaces">
    <parameter key="x" value="http://www.w3.org/1999/xhtml"/>
  </list>
  ...
</FeatureExtractor>
and...
<!-- Attribute entry whose XPath steps all carry the x: prefix, matching the prefix bound to http://www.w3.org/1999/xhtml in the fragment above. -->
<parameter key="abc" value="//x:div/x:div/text()"/>
works ;D