Skip to content
Advertisement

Merge XML files and delete duplicate rows

I’m using the following script to merge XML files. There are 5 different XML file types, ItemAvailability.xml, ItemUpsert.xml, ItemCatDesc.xml, ItemPrice.xml and ItemDelete.xml. How can I have the same script remove duplicate rows from the combined XML files?

I really only need the duplicates removed from the ItemAvailability.xml files, the script is run on a 15-minute interval cron job. Any ideas would be helpful.

Using @Sahil M edited script works when I run only the ItemAvailability.xml, but when I run all the XML files together I get the following error:

Traceback (most recent call last):
  File "/var/www/script/mergeXML.py", line 24, in <module>
    run(sys.argv[1:])
  File "/var/www/script/mergeXML.py", line 17, in run
    if ( (row[0].text, row[1].text) ) in withoutDuplicates:
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 266, in __getitem__
    return self._children[index]
IndexError: list index out of range

@Sahil M updated script

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    # Creating without duplicates here, which contains the unique list of elements determined by values of subelements
    withoutDuplicates = []
    if first is not None:
        for row in first.findall('row'):
            if ( (row[0].text, row[1].text) ) in withoutDuplicates:
                first.remove(row) # Removing duplicate row
            else:
                withoutDuplicates.append((row[0].text, row[1].text))
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

Original Script

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    if first is not None:
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

To combine files, run:

python xmlcombine.py ?.xml > combined.xml

Script source: Merge multiple XML files from command line

Sample XML files.

ItemAvailability.xml

<FF_IT_AP_P_S table="XCHLIB.ITAP00000">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
</FF_IT_AP_P_S>

ItemUpsert.xml

<AS_Items table="XCHLIB.ITUP018495">
    <row>
        <UPITEM type="1" typename="char">03-020-07128   </UPITEM>
        <UPMFGP type="1" typename="char">03-020-07128   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016780   </UPUPC>
        <UPDSCR type="1" typename="char">DAN KEYMD OFFSET FLSHLGHT MT  </UPDSCR>
        <UPRTPR type="2" typename="numeric">55.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">0.5000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-005-10001   </UPITEM>
        <UPMFGP type="1" typename="char">01-005-10001   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">852548002028   </UPUPC>
        <UPDSCR type="1" typename="char">DAN OMEGA RAIL 7.0 CARBINE    </UPDSCR>
        <UPRTPR type="2" typename="numeric">229.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.0000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-147-02038   </UPITEM>
        <UPMFGP type="1" typename="char">01-147-02038   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016704   </UPUPC>
        <UPDSCR type="1" typename="char">DAN SLIM RAIL 12.0 RIFLE      </UPDSCR>
        <UPRTPR type="2" typename="numeric">299.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.4000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-147-22026   </UPITEM>
        <UPMFGP type="1" typename="char">01-147-22026   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016698   </UPUPC>
        <UPDSCR type="1" typename="char">DAN SLIM RAIL 15.0 RIFLE      </UPDSCR>
        <UPRTPR type="2" typename="numeric">299.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.5500</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
</AS_Items>

ItemCatDesc.xml

<STEP-ProductInformation>
    <Products>
        <Product AnalyzerResult="included" ID="094168" ParentID="MM11083" UserTypeID="Item">
            <Name>5659R</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">608.69</Value>
                <Value AttributeID="Item Class">02017</Value>
                <Value AttributeID="Long Gun Web Model">Model 870</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">21 VR/RC:20 RS/FR</Value>
                <Value AttributeID="Consumer Description">Model 870 Express Compact Combo 20 Gauge 21 Inch Vent Rib Barrel and 20 Inch Fully Rifled Deer Barrel Synthetic Stock Matte Black Finish 4 Round</Value>
                <Value AttributeID="Gauge">20</Value>
                <Value AttributeID="Made In The USA">Made In The USA</Value>
                <Value AttributeID="Item Code">5659R</Value>
                <Value AttributeID="Catalog Vendor Name">REMINGTON</Value>
                <Value AttributeID="Capacity">4</Value>
                <Value AttributeID="Chamber Inch">3</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Package Guns">Package Guns</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">REM 870 EXP CPT COMBO 20 21/20</Value>
                <Value AttributeID="Primary Vendor">253LG</Value>
                <Value AttributeID="Caliber-Gauge">20 GA.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Black synthetic stock. Black matte metal finish - non-reflective. Twenty one inch vent rib barrel with RemChoke and single bead sights and a twenty inch fully rifled deer barrel with rifled sights. Four round magazine capacity.</Value>
                <Value AttributeID="Master Model Header">Model 870 Express Compact Synthetic Combo</Value>
                <Value AttributeID="Vendor Group">REM</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="rem_5659r" Type="Primary Image" />
        </Product>
        <Product AnalyzerResult="included" ID="135950" ParentID="MM202159" UserTypeID="Item">
            <Name>85118</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">776.00</Value>
                <Value AttributeID="Item Class">02022</Value>
                <Value AttributeID="Long Gun Web Model">Model 930</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">24</Value>
                <Value AttributeID="Consumer Description">Model 930 Jerry Miculek JM Pro Series 12 Gauge 3 Inch Chamber 24 Inch Vent Rib Barrel Black Synthetic Stock 9 Round</Value>
                <Value AttributeID="Gauge">12</Value>
                <Value AttributeID="Made In The USA">Made In The USA</Value>
                <Value AttributeID="Item Code">85118</Value>
                <Value AttributeID="Catalog Vendor Name">MOSSBERG</Value>
                <Value AttributeID="Choke">F, M, IC</Value>
                <Value AttributeID="Capacity">9</Value>
                <Value AttributeID="Stock">Black Synthetic</Value>
                <Value AttributeID="Chamber Inch">3</Value>
                <Value AttributeID="Finish">Matte Blue</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">MOS 930 JM 12 3 24VR SYN 9</Value>
                <Value AttributeID="Primary Vendor">215RT</Value>
                <Value AttributeID="Caliber-Gauge">12 GA.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Competition level engineering. Features a beveled loading gate and shortened forend for ease of loading and an added barrel clamp for extended mag tube stability. Dual gas system for recoil reduction. Accu-set choke tubes includes Full, Modified and Improved Cylinder. Engraved receiver. Knurled charging handle for retracting the bolt. Quick-empy magazine button for unloading. Matte blue vent-rib barrel. Black synthetic stock and forend with checkering. Fiber optic front sight. Three inch chamber. Gun lock included.</Value>
                <Value AttributeID="Master Model Header">Model 930 Jerry Miculek JM Pro Series</Value>
                <Value AttributeID="Vendor Group">MOS</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="MOS_85118" Type="Primary Image" />
        </Product>
        <Product AnalyzerResult="included" ID="157866" ParentID="MM241817" UserTypeID="Item">
            <Name>JRTF614</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">950.00</Value>
                <Value AttributeID="Item Class">02046</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">22</Value>
                <Value AttributeID="Consumer Description">Tikka T3 Forest .22-250 Remington 22 Inch Barrel Blue Finish No Sights Walnut Stock 3 Round</Value>
                <Value AttributeID="Item Code">JRTF614</Value>
                <Value AttributeID="Caliber">.22-250 Rem.</Value>
                <Value AttributeID="Catalog Vendor Name">BERETTA</Value>
                <Value AttributeID="Capacity">3</Value>
                <Value AttributeID="Finish">Blue</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">BER T3 FORST 22250 22 BL WLN 3</Value>
                <Value AttributeID="Primary Vendor">120TK</Value>
                <Value AttributeID="Caliber-Gauge">.22-250 REM.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Features a roll-over cheek piece on the checkered walnut stock. Adjustable single stage trigger. Two-way linear safety. Sling swivel studs. No sights. Black rubber recoil pad. Cold hammer forged, free-floating barrel with recessed and precision crowned muzzle. Single column detachable magazine with flush release.</Value>
                <Value AttributeID="Master Model Header">Tikka T3 Forest</Value>
                <Value AttributeID="Vendor Group">BER</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="BER_JRTF615" Type="Primary Image" />
        </Product>
</STEP-ProductInformation>

ItemPrice.xml

<ICA_00023535_PR_P_S table="XCHLIB.ITPR00000">
    <row>
        <PRITEM type="1" typename="char">03-020-07128   </PRITEM>
        <PRPRICE type="2" typename="numeric">41.77</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-005-10001   </PRITEM>
        <PRPRICE type="2" typename="numeric">178.48</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-02038   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-22026   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-045-18025   </PRITEM>
        <PRPRICE type="2" typename="numeric">65.50</PRPRICE>
    </row>
</ICA_00023535_PR_P_S>

ItemDelete.xml

<FF_IT_DL_P_S table="XCHLIB.ITDL00000">
    <row>
        <DLITEM type="1" typename="char">S3H5V242412PHC </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX82424411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">5-93801        </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
</FF_IT_DL_P_S>

Sorry for the confusion here, I’m not trying to merge all the different file types into 1. Here is the way I have everything set up.

In 15 minute intervals, I auto download a batch of files, there is usually 3 or 4 files of each type So I end up with something like this:

ItemAvailability-111.xml, ItemAvailability-222.xml, ItemAvailability-333.xml ItemUpsert-111.xml, ItemUpsert-222.xml, ItemUpsert-333.xml ItemCatDesc-111.xml, ItemCatDesc-222.xml, ItemCatDesc-333.xml ItemPrice-111.xml, ItemPrice-222.xml, ItemPrice-333.xml ItemDelete-111.xml, ItemDelete-222.xml

Then my main script sorts these files and runs the merge script on each file type, so when it’s finished I have 5 files (ItemAvailability-combined.xml, ItemUpsert-combined.xml, ItemCatDesc-combined.xml, ItemPrice-combined.xml and ItemDelete-combined.xml) to import into my store.

Advertisement

Answer

The following removes the duplicate rows from the combined files. The duplicate rows are extracted using sub-elements of each row.

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    # Creating without duplicates here, which contains the unique list of elements determined by values of subelements
    withoutDuplicates = []
    if first is not None:
        for row in first.findall('row'):
            if ( (row[0].text, row[1].text) ) in withoutDuplicates:
                first.remove(row) # Removing duplicate row
            else:
                withoutDuplicates.append((row[0].text, row[1].text))
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

Merging the file with itself to demonstrate duplicate entries from other files also:

python mergeXML.py sample.xml sample.xml

And the result:

<FF_IT_AP_P_S table="XCHLIB.ITAP018495">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    </FF_IT_AP_P_S>

Based on the update in the question, just change the set comparison to single element comparison. So your for loop should be:

    for row in first.findall('row'):
        if ( row[0].text) in withoutDuplicates:
            first.remove(row) # Removing duplicate row
        else:
            withoutDuplicates.append(row[0].text)

And when you can run

python mergeXML.py sample.xml itemDelete.xml itemPrice.xml > mergedFile.xml

And mergedFile.xml is :

<FF_IT_AP_P_S table="XCHLIB.ITAP00000">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    <row>
        <DLITEM type="1" typename="char">S3H5V242412PHC </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX82424411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">5-93801        </DLITEM>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-020-07128   </PRITEM>
        <PRPRICE type="2" typename="numeric">41.77</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-005-10001   </PRITEM>
        <PRPRICE type="2" typename="numeric">178.48</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-02038   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-22026   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-045-18025   </PRITEM>
        <PRPRICE type="2" typename="numeric">65.50</PRPRICE>
    </row>
</FF_IT_AP_P_S>

You can have arbitrary number of files with complete tags.

User contributions licensed under: CC BY-SA
4 People found this is helpful
Advertisement