The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
[SOLVED] Sentiment Analysis using SVM Linear - Speed Problem
Hello,
I have been working on developing a sentiment classifier using Linear SVM in RapidMiner.
I use the data found at http://snap.stanford.edu/data/web-Amazon.html as my training set.
The dataset I use is very large - about 78,000 positive and negative reviews.
When I run this process, it has taken over 13 hours and is still running (currently in the Validation stage)!
So, my question is whether this duration is acceptable, or is there any way to optimise the classifier? What is the estimated time for datasets of this size?
Here is the XML code:
Thanks,
Keshav
I have been working on developing a sentiment classifier using Linear SVM in RapidMiner.
I use the data found at http://snap.stanford.edu/data/web-Amazon.html as my training set.
The dataset I use is very large - about 78,000 positive and negative reviews.
When I run this process, it has taken over 13 hours and is still running (currently in the Validation stage)!
So, my question is whether this duration is acceptable, or is there any way to optimise the classifier? What is the estimated time for datasets of this size?
Here is the XML code:
Hoping to find a solution to this problem.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Sentiment classification process.

  Pipeline:
    1. "Process Documents from Files" turns the review files in the two
       text directories (keyed "negative" and "positive") into an example
       set plus a word list.
    2. "Validation" (x_validation) trains "SVM (Linear)" on the training
       port and scores it with "Apply Model" and "Performance".
    3. "Store" saves the word list and "Store (2)" saves the model to the
       local repository.
-->
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <!-- Document vectorisation. The directory paths are machine-specific
           absolute Windows paths and must be adjusted on another machine. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="negative" value="C:\Users\Keshav\Documents\Academics\Project\train\neg"/>
          <parameter key="positive" value="C:\Users\Keshav\Documents\Academics\Project\train\pos"/>
        </list>
        <!-- NOTE(review): presumably prunes tokens that occur in more than
             95% of the documents; no lower bound is set here, so the
             operator default applies. Confirm against the operator docs. -->
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="95.0"/>
        <process expanded="true">
          <!-- Per-document token chain:
               Tokenize, Filter Tokens (by Length), Filter Stopwords (English), Stem (Porter).
               Stopwords are removed BEFORE stemming: Porter stemming rewrites
               tokens (for example "this" becomes "thi"), so a stopword list
               applied after stemming would miss them and the leftover stems
               would end up as extra attributes for the learner. -->
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="216" y="162"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="292" y="247"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.3.002" expanded="true" height="60" name="Stem (Porter)" width="90" x="421" y="338"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cross-validation. No parameters are set on this operator or on the
           SVM, so their defaults apply (fold count, SVM C, and so on). -->
      <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="313" y="165">
        <!-- Training subprocess: fit the linear SVM and hand the model on. -->
        <process expanded="true">
          <operator activated="true" class="support_vector_machine_linear" compatibility="5.3.013" expanded="true" height="76" name="SVM (Linear)" width="90" x="112" y="30"/>
          <connect from_port="training" to_op="SVM (Linear)" to_port="training set"/>
          <connect from_op="SVM (Linear)" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: apply the model to the test set and measure
             performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="51" y="56">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="99" y="165"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- "Store (2)" persists the model produced by Validation. -->
      <operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store (2)" width="90" x="380" y="30">
        <parameter key="repository_entry" value="//Local Repository/data2"/>
      </operator>
      <!-- "Store" persists the word list produced by document processing. -->
      <operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store" width="90" x="45" y="210">
        <parameter key="repository_entry" value="//Local Repository/data1"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_op="Store" to_port="input"/>
      <connect from_op="Validation" from_port="model" to_op="Store (2)" to_port="input"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <connect from_op="Store (2)" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Thanks,
Keshav
0
Answers
You have to do experiments, varying the data and parameters.
Here are some remarks:
1) Use "Stem"-Operator at the end
2) Reduce number of validations from 10 to e.g. 3
3) Reduce the amount of data by inserting a "Sample" operator after the
"Process Documents from Files" operator.
4) Use a less computational demanding algorithm first, e.g. Naive Bayes
5) And finally: Don't forget to optimize the parameter C in the SVM algorithm.