The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
LOF on Text Data
Hello Team,
I am fairly new to RM and currently conducting some research on online text.
In particular, I am trying to detect outliers in a set of documents by using the LOF operator.
Now I have some troubles, since the LOF for each document is very close to 1, no matter how I set the MinPtsUB and MinPtsLB.
Basically, I have represented each document as a vector of term frequency (TF) and TF-IDF values before applying the LOF operator.
So I have two ExampleSets representing the corpus, one as a matrix of TF values and one as a matrix of TF-IDF values, to check the differences.
However, for both matrices I get LOF values that are equal or very close to one, which does not make any sense to me.
Could you tell me whether I am doing something wrong and, if so, what?
Best
Please find my XML enclosed:
I am fairly new to RM and currently conducting some research on online text.
In particular, I am trying to detect outliers in a set of documents by using the LOF operator.
Now I have some troubles, since the LOF for each document is very close to 1, no matter how I set the MinPtsUB and MinPtsLB.
Basically, I have represented each document as a vector of term frequency (TF) and TF-IDF values before applying the LOF operator.
So I have two ExampleSets representing the corpus, one as a matrix of TF values and one as a matrix of TF-IDF values, to check the differences.
However, for both matrices I get LOF values that are equal or very close to one, which does not make any sense to me.
Could you tell me whether I am doing something wrong and, if so, what?
Best
Please find my XML enclosed:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process with two parallel branches, one per stored ExampleSet
  (../Data/PreppedDatabase_TF and ../Data/PreppedDatabase_TF-IDF):

    Retrieve -> Select Attributes -> Detect Outlier (LOF) -> Store -> result port

  Each branch also contains two operators with activated="false" (the
  anomalydetection extension's LOF operator and a Write Excel operator);
  they are not wired into any connection.
-->
<process version="9.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">

      <!-- Branch 1: TF matrix -->
      <operator activated="true" class="retrieve" compatibility="9.2.000" expanded="true" height="68" name="Retrieve PreppedTestData" width="90" x="112" y="34">
        <parameter key="repository_entry" value="../Data/PreppedDatabase_TF"/>
      </operator>
      <!-- Filters attributes by value type "real"; special attributes are included in the filter. -->
      <operator activated="true" class="select_attributes" compatibility="9.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="value_type"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Date"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="real"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- LOF with MinPts bounds 1..3. -->
      <operator activated="true" class="detect_outlier_lof" compatibility="9.2.000" expanded="true" height="82" name="Detect Outlier (LOF)" width="90" x="447" y="34">
        <parameter key="minimal_points_lower_bound" value="1"/>
        <parameter key="minimal_points_upper_bound" value="3"/>
        <parameter key="distance_function" value="euclidian distance"/>
      </operator>
      <!-- Deactivated alternative: LOF operator from the anomalydetection extension. -->
      <operator activated="false" class="anomalydetection:Local Outlier Factor (LOF)" compatibility="2.4.001" expanded="true" height="103" name="Local Outlier Factor (LOF)" width="90" x="380" y="340">
        <parameter key="k_min (MinPtsLB)" value="1"/>
        <parameter key="k_max (MinPtsUB)" value="10"/>
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="EuclideanDistance"/>
        <parameter key="divergence" value="GeneralizedIDivergence"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="parallelize evaluation process" value="false"/>
        <parameter key="number of threads" value="4"/>
      </operator>
      <operator activated="true" class="store" compatibility="9.2.000" expanded="true" height="68" name="Store" width="90" x="648" y="34">
        <parameter key="repository_entry" value="../Results/LOF_TF"/>
      </operator>
      <!-- Deactivated Excel export of the TF results. -->
      <operator activated="false" class="write_excel" compatibility="9.2.000" expanded="true" height="82" name="Write Excel" width="90" x="581" y="442">
        <parameter key="excel_file" value="\\ads.dlh.de\lhuser$\LHT\HAM99\U801591\Documents\000_Masterarbeit\05_Praxis\04_LOF - Outlier Detection\042_Results\LOF_TF.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="LOF_TF"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>

      <!-- Branch 2: TF-IDF matrix (same operator chain and parameters as branch 1) -->
      <operator activated="true" class="retrieve" compatibility="9.2.000" expanded="true" height="68" name="Retrieve PreppedTestData (2)" width="90" x="112" y="187">
        <parameter key="repository_entry" value="../Data/PreppedDatabase_TF-IDF"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.2.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="246" y="187">
        <parameter key="attribute_filter_type" value="value_type"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Date"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="real"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="detect_outlier_lof" compatibility="9.2.000" expanded="true" height="82" name="Detect Outlier (2)" width="90" x="447" y="187">
        <parameter key="minimal_points_lower_bound" value="1"/>
        <parameter key="minimal_points_upper_bound" value="3"/>
        <parameter key="distance_function" value="euclidian distance"/>
      </operator>
      <!-- Deactivated alternative: LOF operator from the anomalydetection extension. -->
      <operator activated="false" class="anomalydetection:Local Outlier Factor (LOF)" compatibility="2.4.001" expanded="true" height="103" name="Local Outlier Factor (2)" width="90" x="380" y="493">
        <parameter key="k_min (MinPtsLB)" value="1"/>
        <parameter key="k_max (MinPtsUB)" value="10"/>
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="EuclideanDistance"/>
        <parameter key="divergence" value="GeneralizedIDivergence"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="parallelize evaluation process" value="false"/>
        <parameter key="number of threads" value="4"/>
      </operator>
      <operator activated="true" class="store" compatibility="9.2.000" expanded="true" height="68" name="Store (2)" width="90" x="648" y="187">
        <parameter key="repository_entry" value="../Results/LOF_TF-IDF"/>
      </operator>
      <!-- Deactivated Excel export of the TF-IDF results. -->
      <operator activated="false" class="write_excel" compatibility="9.2.000" expanded="true" height="82" name="Write Excel (2)" width="90" x="648" y="595">
        <parameter key="excel_file" value="\\ads.dlh.de\lhuser$\LHT\HAM99\U801591\Documents\000_Masterarbeit\05_Praxis\04_LOF - Outlier Detection\042_Results\LOF_TF-IDF.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="LOF_TF-IDF"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>

      <!-- Wiring for branch 1 (ends at result port 1) -->
      <connect from_op="Retrieve PreppedTestData" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Detect Outlier (LOF)" to_port="example set input"/>
      <connect from_op="Detect Outlier (LOF)" from_port="example set output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_port="result 1"/>

      <!-- Wiring for branch 2 (ends at result port 2) -->
      <connect from_op="Retrieve PreppedTestData (2)" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Detect Outlier (2)" to_port="example set input"/>
      <connect from_op="Detect Outlier (2)" from_port="example set output" to_op="Store (2)" to_port="input"/>
      <connect from_op="Store (2)" from_port="through" to_port="result 2"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0
Best Answer
-
tamberge Member Posts: 6 Contributor II
So I have been trying different methods in all possible combinations for a test set of 26 examples:
- changing MinPts UB and LB (1-2, 2-3, 5-10),
- choosing different vectors (TF, TF-IDF, Term Occurrences, Binary Term Occurrences),
- pruning (filtering frequent words and filtering infrequent words).
However, I was not able to get LOF values that are much greater than 1 (LOF >> 1).
So does anyone have a theory about where this is coming from?
I can also share the data, if you want.
0
Answers
Or you could switch to a different outlier detection algorithm that is more inherently distance based like k-nn anomaly score rather than density based, although you may still run into similar problems.
Lindon Ventures
Data Science Consulting from Certified RapidMiner Experts
I will let you know, if it has any positive impact on the outcome!
Thanks again!