The Altair Community is migrating to a new platform to provide a better experience for you. In preparation for the migration, the Altair Community is on read-only mode from October 28 - November 6, 2024. Technical support via cases will continue to work as is. For any urgent requests from Students/Faculty members, please submit the form linked here
"ETL - extracting data from text-space delimited file"
sgenzer
Administrator, Moderator, Employee-RapidMiner, RapidMiner Certified Analyst, Community Manager, Member, University Professor, PM Moderator Posts: 2,959 Community Manager
Hi all -
I am trying to extract data from a very old file that is organized by placement of character per line (and uses spaces to fill in gaps), instead of CSV. For example:
1 GENZER SCOTT 76 33 KERWIN HILL ROAD NORWICH
2 JOHNSON JIM 81 4 MAIN ST NEW YORK CITY
3 CLINTON BILL 66 16 APPLEBY AVE LITTLE ROCK
So the 1st attribute (ID) is at the 1st character, the 2nd attribute (LAST NAME) is from the 3rd character to the 17th character, the 3rd attribute (FIRST NAME) is from the 18th character and so on. Is there any elegant way to extract these data? The only way I can think to do this is to multiply the input, rename the attribute, and then join (XML below). Anything better? Each line of this datafile is about 2000 characters long with about 50 attributes. There is some pattern to the attributes but still...
Thanks all!
Scott
<operator activated="true" class="loop_files" compatibility="5.3.015" expanded="true" height="76" name="Loop Files" width="90" x="179" y="75">
<parameter key="directory" value="/Users/foo/>
<parameter key="filter" value="*.dat"/>
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.015" expanded="true" height="60" name="Read CSV (2)" width="90" x="45" y="30">
<parameter key="csv_file" value="/Users/foo/03105302.DAT"/>
<parameter key="trim_lines" value="true"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations"/>
<list key="data_set_meta_data_information"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="94" name="Multiply (2)" width="90" x="179" y="30"/>
<operator activated="true" class="extract_macro_from_annotation" compatibility="5.3.015" expanded="true" height="60" name="Extract Macro from Annotation (2)" width="90" x="179" y="165">
<parameter key="macro" value="filename"/>
<parameter key="annotation" value="Source"/>
</operator>
<operator activated="true" class="subprocess" compatibility="5.3.015" expanded="true" height="76" name="INFO EXTRACT (2)" width="90" x="313" y="30">
<process expanded="true">
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="166" name="Multiply (3)" width="90" x="112" y="30"/>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (77)" width="90" x="380" y="30">
<parameter key="first_character_index" value="125"/>
<parameter key="last_character_index" value="125"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (77)" width="90" x="514" y="30"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (73)" width="90" x="648" y="30">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Gender"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (78)" width="90" x="380" y="120">
<parameter key="first_character_index" value="115"/>
<parameter key="last_character_index" value="115"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (78)" width="90" x="514" y="120"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (74)" width="90" x="648" y="120">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Middle Initial"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (79)" width="90" x="380" y="210">
<parameter key="first_character_index" value="107"/>
<parameter key="last_character_index" value="114"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (79)" width="90" x="514" y="210"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (75)" width="90" x="648" y="210">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="First Name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (80)" width="90" x="380" y="300">
<parameter key="first_character_index" value="95"/>
<parameter key="last_character_index" value="106"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (80)" width="90" x="514" y="300"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (76)" width="90" x="648" y="300">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Last Name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (81)" width="90" x="380" y="390">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="att1"/>
<parameter key="first_character_index" value="55"/>
<parameter key="last_character_index" value="94"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (81)" width="90" x="514" y="390"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (77)" width="90" x="648" y="390">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="school name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (82)" width="90" x="380" y="480">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="att1"/>
<parameter key="last_character_index" value="28"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (82)" width="90" x="514" y="480"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (78)" width="90" x="648" y="480">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="initial numerical data"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (45)" width="90" x="782" y="120">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (44)" width="90" x="782" y="210">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (43)" width="90" x="782" y="300">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (42)" width="90" x="782" y="390">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (2)" width="90" x="782" y="480">
<list key="key_attributes"/>
</operator>
<connect from_port="in 1" to_op="Multiply (3)" to_port="input"/>
<connect from_op="Multiply (3)" from_port="output 1" to_op="Cut (77)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 2" to_op="Cut (78)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 3" to_op="Cut (79)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 4" to_op="Cut (80)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 5" to_op="Cut (81)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 6" to_op="Cut (82)" to_port="example set input"/>
<connect from_op="Cut (77)" from_port="example set output" to_op="Generate ID (77)" to_port="example set input"/>
<connect from_op="Generate ID (77)" from_port="example set output" to_op="Rename (73)" to_port="example set input"/>
<connect from_op="Rename (73)" from_port="example set output" to_op="Join (45)" to_port="left"/>
<connect from_op="Cut (78)" from_port="example set output" to_op="Generate ID (78)" to_port="example set input"/>
<connect from_op="Generate ID (78)" from_port="example set output" to_op="Rename (74)" to_port="example set input"/>
<connect from_op="Rename (74)" from_port="example set output" to_op="Join (45)" to_port="right"/>
<connect from_op="Cut (79)" from_port="example set output" to_op="Generate ID (79)" to_port="example set input"/>
<connect from_op="Generate ID (79)" from_port="example set output" to_op="Rename (75)" to_port="example set input"/>
<connect from_op="Rename (75)" from_port="example set output" to_op="Join (44)" to_port="right"/>
<connect from_op="Cut (80)" from_port="example set output" to_op="Generate ID (80)" to_port="example set input"/>
<connect from_op="Generate ID (80)" from_port="example set output" to_op="Rename (76)" to_port="example set input"/>
<connect from_op="Rename (76)" from_port="example set output" to_op="Join (43)" to_port="right"/>
<connect from_op="Cut (81)" from_port="example set output" to_op="Generate ID (81)" to_port="example set input"/>
<connect from_op="Generate ID (81)" from_port="example set output" to_op="Rename (77)" to_port="example set input"/>
<connect from_op="Rename (77)" from_port="example set output" to_op="Join (42)" to_port="right"/>
<connect from_op="Cut (82)" from_port="example set output" to_op="Generate ID (82)" to_port="example set input"/>
<connect from_op="Generate ID (82)" from_port="example set output" to_op="Rename (78)" to_port="example set input"/>
<connect from_op="Rename (78)" from_port="example set output" to_op="Join (2)" to_port="right"/>
<connect from_op="Join (45)" from_port="join" to_op="Join (44)" to_port="left"/>
<connect from_op="Join (44)" from_port="join" to_op="Join (43)" to_port="left"/>
<connect from_op="Join (43)" from_port="join" to_op="Join (42)" to_port="left"/>
<connect from_op="Join (42)" from_port="join" to_op="Join (2)" to_port="left"/>
<connect from_op="Join (2)" from_port="join" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
I am trying to extract data from a very old file that is organized by placement of character per line (and uses spaces to fill in gaps), instead of CSV. For example:
1 GENZER SCOTT 76 33 KERWIN HILL ROAD NORWICH
2 JOHNSON JIM 81 4 MAIN ST NEW YORK CITY
3 CLINTON BILL 66 16 APPLEBY AVE LITTLE ROCK
So the 1st attribute (ID) is at the 1st character, the 2nd attribute (LAST NAME) is from the 3rd character to the 17th character, the 3rd attribute (FIRST NAME) is from the 18th character and so on. Is there any elegant way to extract these data? The only way I can think to do this is to multiply the input, rename the attribute, and then join (XML below). Anything better? Each line of this datafile is about 2000 characters long with about 50 attributes. There is some pattern to the attributes but still...
Thanks all!
Scott
<operator activated="true" class="loop_files" compatibility="5.3.015" expanded="true" height="76" name="Loop Files" width="90" x="179" y="75">
<parameter key="directory" value="/Users/foo/>
<parameter key="filter" value="*.dat"/>
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.015" expanded="true" height="60" name="Read CSV (2)" width="90" x="45" y="30">
<parameter key="csv_file" value="/Users/foo/03105302.DAT"/>
<parameter key="trim_lines" value="true"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations"/>
<list key="data_set_meta_data_information"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="94" name="Multiply (2)" width="90" x="179" y="30"/>
<operator activated="true" class="extract_macro_from_annotation" compatibility="5.3.015" expanded="true" height="60" name="Extract Macro from Annotation (2)" width="90" x="179" y="165">
<parameter key="macro" value="filename"/>
<parameter key="annotation" value="Source"/>
</operator>
<operator activated="true" class="subprocess" compatibility="5.3.015" expanded="true" height="76" name="INFO EXTRACT (2)" width="90" x="313" y="30">
<process expanded="true">
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="166" name="Multiply (3)" width="90" x="112" y="30"/>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (77)" width="90" x="380" y="30">
<parameter key="first_character_index" value="125"/>
<parameter key="last_character_index" value="125"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (77)" width="90" x="514" y="30"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (73)" width="90" x="648" y="30">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Gender"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (78)" width="90" x="380" y="120">
<parameter key="first_character_index" value="115"/>
<parameter key="last_character_index" value="115"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (78)" width="90" x="514" y="120"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (74)" width="90" x="648" y="120">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Middle Initial"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (79)" width="90" x="380" y="210">
<parameter key="first_character_index" value="107"/>
<parameter key="last_character_index" value="114"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (79)" width="90" x="514" y="210"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (75)" width="90" x="648" y="210">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="First Name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (80)" width="90" x="380" y="300">
<parameter key="first_character_index" value="95"/>
<parameter key="last_character_index" value="106"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (80)" width="90" x="514" y="300"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (76)" width="90" x="648" y="300">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="Last Name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (81)" width="90" x="380" y="390">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="att1"/>
<parameter key="first_character_index" value="55"/>
<parameter key="last_character_index" value="94"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (81)" width="90" x="514" y="390"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (77)" width="90" x="648" y="390">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="school name"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="cut" compatibility="5.3.015" expanded="true" height="76" name="Cut (82)" width="90" x="380" y="480">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="att1"/>
<parameter key="last_character_index" value="28"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true" height="76" name="Generate ID (82)" width="90" x="514" y="480"/>
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true" height="76" name="Rename (78)" width="90" x="648" y="480">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="initial numerical data"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (45)" width="90" x="782" y="120">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (44)" width="90" x="782" y="210">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (43)" width="90" x="782" y="300">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (42)" width="90" x="782" y="390">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join (2)" width="90" x="782" y="480">
<list key="key_attributes"/>
</operator>
<connect from_port="in 1" to_op="Multiply (3)" to_port="input"/>
<connect from_op="Multiply (3)" from_port="output 1" to_op="Cut (77)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 2" to_op="Cut (78)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 3" to_op="Cut (79)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 4" to_op="Cut (80)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 5" to_op="Cut (81)" to_port="example set input"/>
<connect from_op="Multiply (3)" from_port="output 6" to_op="Cut (82)" to_port="example set input"/>
<connect from_op="Cut (77)" from_port="example set output" to_op="Generate ID (77)" to_port="example set input"/>
<connect from_op="Generate ID (77)" from_port="example set output" to_op="Rename (73)" to_port="example set input"/>
<connect from_op="Rename (73)" from_port="example set output" to_op="Join (45)" to_port="left"/>
<connect from_op="Cut (78)" from_port="example set output" to_op="Generate ID (78)" to_port="example set input"/>
<connect from_op="Generate ID (78)" from_port="example set output" to_op="Rename (74)" to_port="example set input"/>
<connect from_op="Rename (74)" from_port="example set output" to_op="Join (45)" to_port="right"/>
<connect from_op="Cut (79)" from_port="example set output" to_op="Generate ID (79)" to_port="example set input"/>
<connect from_op="Generate ID (79)" from_port="example set output" to_op="Rename (75)" to_port="example set input"/>
<connect from_op="Rename (75)" from_port="example set output" to_op="Join (44)" to_port="right"/>
<connect from_op="Cut (80)" from_port="example set output" to_op="Generate ID (80)" to_port="example set input"/>
<connect from_op="Generate ID (80)" from_port="example set output" to_op="Rename (76)" to_port="example set input"/>
<connect from_op="Rename (76)" from_port="example set output" to_op="Join (43)" to_port="right"/>
<connect from_op="Cut (81)" from_port="example set output" to_op="Generate ID (81)" to_port="example set input"/>
<connect from_op="Generate ID (81)" from_port="example set output" to_op="Rename (77)" to_port="example set input"/>
<connect from_op="Rename (77)" from_port="example set output" to_op="Join (42)" to_port="right"/>
<connect from_op="Cut (82)" from_port="example set output" to_op="Generate ID (82)" to_port="example set input"/>
<connect from_op="Generate ID (82)" from_port="example set output" to_op="Rename (78)" to_port="example set input"/>
<connect from_op="Rename (78)" from_port="example set output" to_op="Join (2)" to_port="right"/>
<connect from_op="Join (45)" from_port="join" to_op="Join (44)" to_port="left"/>
<connect from_op="Join (44)" from_port="join" to_op="Join (43)" to_port="left"/>
<connect from_op="Join (43)" from_port="join" to_op="Join (42)" to_port="left"/>
<connect from_op="Join (42)" from_port="join" to_op="Join (2)" to_port="left"/>
<connect from_op="Join (2)" from_port="join" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
Tagged:
0
Answers
I read in your file as csv with a regex as delimiter if it is one or more white spaces or if it is 2 or more whitespaces (etc.).
This works quite well. Alternativly you can read the file and use a regex later on
Dortmund, Germany
Unfortunately that does not really solve much for me...the files often "max out" the space allocations such that one field abuts another:
1 GENZER SCOTT 76 33 KERWIN HILL ROAD NORWICH
2 JOHNSON JIM 81 4 MAIN ST NEW YORK CITY
3 CLINTON BILL 66 16 APPLEBY AVE LITTLE ROCK
...
44HERNANDEZ-JIMEJOSE-EDUARDO51 12 MAIN ST NEW YORK CITY
I also have numeric fields that abut like this:
128866673
where it should be
12,88,666,73 (four separate attributes)
more ideas?
Thanks again!
Scott
The "Rename" operator in combination with the "Split" operator is your friend in this case.
Dortmund, Germany
Dortmund, Germany