/* BasicCleaning.do Created by : Isabelle, August 2019 This Dofile : - Applies basic cleaning steps that are common to all sources What is done : 1) Keep only valid siren 2) Keep only valid country code 3) Keep only valid nc8 code 4) Keep only valid flows (no flow should be export=0 & kgs=0) */ * 0. Define working paths and names of input and output datasets global datapath // Working path global dopath // Where do files are saved global dataset // Name of the input dataset global outputdata // Name of the output dataset set more off, perma cd $datapath do "$dopath\id_group.do" use $dataset, clear * Variables an mois flux siren nc8 cpa6 pyod payp euros indicmasse kgs unites year rename pyod iso2 rename euros export *1 Keep Valid siren numbers tostring siren, replace replace siren="0"+siren if length(siren)==8 replace siren="00"+siren if length(siren)==7 drop if length(siren)<=5 #delimit ; drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202" |siren=="777777777"|siren=="888888888"|siren=="999999999"; #delimit cr *2 Keep valid destination/origin countries /* *Codes QQ,QU,QV,QW are anonymized for defense reason ==> dropped *Codes TP XA XC XE XF XG XH XI XK XL XM XO XP XR XS are user-defined codes ==> dropped" *https://www.iso.org/fr/glossary-for-iso-3166.html * XU = Belgium and Luxembourg until 99 * YU Yougoslavia until 2003 * CS : Serbia Montenegra 2004-2005 * TP : Portuguese Timor in 2001-2002 * AN : Netherland Antilles from 95- 2010 */ /* Many EU code as destination country : - can stand for US "Etats Unis" - codes used when people don't know what to put - code used for re-exported goods from switzerland FR code as the origin of Imports can happen when a) French goods are imported from another country and the re-exported from this country to France. Ex: A Belgian wine dealer imports champagn and then sale part of it in France b) Defective exported goods that are re-imported, temporary exports c) Misreporting Decison : Drop these flows FR code as the destination country Could be related to exports from DOM-TOM ==> dropped */ gen drop=1 if inlist(iso2,"QQ","QP","QU","QV","QW","FR") replace drop=1 if inlist(iso2,"GL","PA","XX") replace drop=1 if iso2=="EU" *replace drop=1 if inlist(iso2,"XA","XC","XE","XF","XG") *replace drop=1 if inlist(iso2,"XH","XI","XK","XL","XM") *replace drop=1 if inlist(iso2,"XO","XP","XR","XS","XV","XZ") drop if drop==1 drop drop replace iso2="GR" if iso2=="EL" *3 Clean Products * NC8 99500000 is a code used for small transactions and "envois groupés" * NC8 9880XX00 - XX being 73, 84, 99 or 85 - is a code used for large manufacturing or commercial firms which import values above 3 million euros and benefit from a simplified code grouping all products together * L’arrêté du 26 décembre 2000 (modifié) fixe les conditions d’utilisation de la nomenclature spécifique des produits * 99050000 : personnal belongings of moving individuals * 99190000 : belongings of catastrophy victims (coffin...) * 9930 : Refuelling * 9931 : goods for offsghores installations/infrastructures tostring nc8, replace replace nc8="0"+nc8 if length(nc8)==7 replace nc8="00"+nc8 if length(nc8)==6 replace nc8="000"+nc8 if length(nc8)==5 replace nc8="0000"+nc8 if length(nc8)==4 replace nc8="00000"+nc8 if length(nc8)==3 replace nc8="000000"+nc8 if length(nc8)==2 replace nc8="0000000"+nc8 if length(nc8)==1 g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" ) g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000") egen drop=sum(export) if nc8group==1| nc8part==1 drop if nc8part == 1 drop if nc8group==1 drop nc8group nc8part drop drop if nc8 == "0000000." *4 : remove flows with export=0 and kgs=0 count if export==0 & kgs==0 & unites==0 drop if export==0 & kgs==0 & unites==0 save $outputdata, replace