diff --git a/docs/examples/copy_faker2file/config.yml b/docs/examples/copy_faker2file/config.yml index 76a95ebfc20ce2fdadbe3599fc1f1fd53530156a..9efefb39683a725d751e547bb9b94d5257746e25 100644 --- a/docs/examples/copy_faker2file/config.yml +++ b/docs/examples/copy_faker2file/config.yml @@ -19,3 +19,6 @@ jobs: connector: !file mode: w+ uri: 'file://./fakeperson.csv' + dialect: !dialect + lineterminator: "\n" + quoting: 0 \ No newline at end of file diff --git a/docs/examples/copy_file2file/config.yml b/docs/examples/copy_file2file/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..335f9b634d0dc1d57f8322a59b8e96df47ae5d8d --- /dev/null +++ b/docs/examples/copy_file2file/config.yml @@ -0,0 +1,40 @@ +--- +connectors: + - + !file + name: inputFile + uri: 'file://./data/input.csv' + encoding: 'utf-8' + - + !file + name: outputFile + uri: 'file://./output.csv' + mode: w+ +--- +entities: + - + !entity + name: person + properties: + surname: 'string' + name: 'string' + md5: 'string' +--- +jobs: + - + !copy + entity: person + source: + extractor: !csvextractor + connector: inputFile + dialect: !dialect + delimiter: ':' + lineterminator: "\n" + quoting: 1 + target: + loader: !csvloader + connector: outputFile + dialect: !dialect + delimiter: ',' + lineterminator: "\n" + quoting: 0 \ No newline at end of file diff --git a/docs/examples/copy_file2file/data/expected_output.csv b/docs/examples/copy_file2file/data/expected_output.csv new file mode 100644 index 0000000000000000000000000000000000000000..da8d62cbd9c29ff9515ba070be0f129c333ed7f4 --- /dev/null +++ b/docs/examples/copy_file2file/data/expected_output.csv @@ -0,0 +1,101 @@ +surname,name,md5 +Corkery,Jack,a58bdfbcbcffac8dc6874d0b9b12332c +Purdy,Moses,f4b65103b4348e0edcf29e1cc89b80d4 +Schmidt,Kacey,000534c2af80dcbfc86a3a6ce4114bd4 +Kling,Alyson,fa70e25665a1dcbbffdf15210e7f7919 +West,Etha,eb205d543e087790aef4f6c67dbd3c41 +Runte,Connie,22de2c6a03a850c1114b7abac1b58e88 
+Dickens,Fae,49d67e6fbbe037d2eb872b00f948a243 +Windler,Cortez,5b5fc22d532afbf5666b1aa2e06e24a0 +Lueilwitz,Stephanie,278aba733fd2538d249af23f491e1408 +Crooks,Lyla,34df319ae5078c623e946331efd39613 +Abshire,Golden,e1d1c4aa6493f4e03a7c3da441607e2b +Mann,Clay,383dae9f9e84e22c08bf1876ec2d5f34 +Volkman,Dave,d8f9bff13180516d5c855f81d1aee1a6 +Nienow,Sterling,d769d497f2dddbda623541d2654e22c3 +Lubowitz,Vivien,5954f5a39463a1537992e18c434dff6d +Raynor,Opal,87237b65aed44f8923a0f88a1cbf7293 +Green,Lamont,929a7d0a4bd4d8a0cf3333f982df6913 +Heaney,Ruby,46e54e2808b2703d3b7fef4f2b32ef35 +Kshlerin,Noemy,05d7bccb695d34c32ca1d6c57248f452 +Toy,Maci,24b0ebc4f2211b46584f88805f2d73b0 +Stamm,Sylvia,63b8ec77a11b1fdef2ebad03390d0449 +Runolfsson,Jonathan,9711e4233a5ec0829f18565518eec660 +McCullough,Caitlyn,d5426f6bf7d631c7c6878cd291dbcd00 +Stoltenberg,Casimir,8bcb701ccfbba3879755842dddd20713 +Purdy,Bo,7b0c94073b15ef36a8f484b8a7018eeb +Mertz,Dahlia,e28a885e766a45d33cad2651e33c7c9c +Mraz,Mac,2b4b5ee8ef339da174400ae03964f596 +Mante,Estella,dd10d779fb9f4b87e5c7a4f44cce3a3c +Buckridge,Clay,9c88404db46c4783ee2a7ff886acc2e1 +Reichel,Corene,327f72f579a4bcaf0c7fe497f8547505 +Weber,Gregoria,742df07ecd1093e0857d5198cec56116 +Mosciski,Magdalen,2dea2861226eb7db026d5b9168958abc +Littel,May,f5b5965cd5f502bfd1069a676c2fb315 +Schmeler,Savion,69683238030d211737dfe5ba68619081 +Donnelly,Delpha,ddabbe379f9e7f8f70da8777415c39f8 +Connelly,Lucio,9426741c677091d3f3d0938465121eea +Bergstrom,Tyshawn,4b04f7196b71ff9d71a8953c1fee33a1 +Lind,Jaren,ee400c603fd1b8bcdac2d4c4f777b040 +Klein,Clementine,79904e1168fa4949d080862b61da45ec +Fritsch,Burnice,93270a9e12d5ab4cf377c4aacf399ff4 +Jenkins,Tom,6282403a66445bc9794cec6c6e97ddb4 +Wuckert,Gregorio,42b9b3806dd77f1404cbab764fd7e1dc +Quitzon,Clarissa,2cf487a510eab609dd7b09583a080582 +Hudson,Donna,0ee2034f98c05dfa6097ec557696e0a1 +Gutmann,Trent,4d4f6ae782a04b6f46761230e7602215 +Luettgen,Darion,b8a1e21cc37eb6fc756d4c6993a3a321 +Block,Eliane,f6cd4c2a7532f01e111cac76e23534e8 
+Muller,Rozella,8d12d10037e26a135d879c567a176448 +Herman,Margie,5a2efccfee605d2a479cee9fe58ec099 +Rempel,Christina,8435faf7c096742ca19283c7a0bcf093 +Bahringer,Lia,e4e04a5fd33bf1cbe4d81abd7d0b5303 +Schmidt,Bud,fe29989f5b9bbd0c3eee17f298c3db47 +Abernathy,Murray,a8b2f7e7325d498b0701c153320d52ac +Labadie,Chance,763279919c64c3c184d616dbbe636dbc +Kihn,Craig,984ab5d01a96e86b832d0816d5ae6b9c +Spinka,Katherine,f022834ace9100a2ea6e80ee15c3c078 +Muller,Loraine,edc2fab54280754aa617ebcfd7623658 +McDermott,Krystina,fc8727640a55a38ae0aec5a5d12405ea +Gusikowski,Sally,75ba9c800724ca29fb354b05e1b1821c +Ernser,Glenda,11835f321233f1057e95094b6b571183 +Quigley,Sabrina,a4fa9f325ddffc7ba9754cd2cc43ad6e +Beahan,Ila,f9925e696406e07a66c6db00db4465ba +Crist,Bennett,baea14e62ac3738eebe57d9d6ffdf648 +Muller,Austin,aab3f778e052623d5368a361b10a027a +Wintheiser,Brooklyn,9f4fd24c761d091aa2ec6b3d8cc964e1 +McCullough,Cecil,1efdbe65338d162a129f335ac9cb7ca7 +Rippin,Genesis,5102f849dd69094b1ce898e1c1d2caef +Braun,Robb,d58ed4a07e4d0f9c7f1e85346e375653 +Buckridge,Taylor,298088519c3a144ac632ac0a4f96f5ed +Volkman,Earline,1a294690883cc30b31ef8c490cf96eb3 +Satterfield,Faustino,685151fc6da4a0e42b622075783d189a +Ankunding,Jody,c50e4a093af50c2c3bdc5a021e42580a +Cassin,London,0031960bcb021f9802f4b977afb9de36 +Krajcik,Johnathon,11d5a432eafdcf3d864a614b101a5362 +Beahan,Stevie,af6c944f3beab59524c9a573a7ab8c07 +McGlynn,Aletha,e40055946d8b214fe43f691292be5212 +Windler,Obie,0613db2754bf924189f23d7c574bdef1 +Batz,Kody,f4d9decf44266c5302f67ff6bdc63cb4 +Yost,Adella,408b3f76b9b7dfc9d11ee406d9387e8e +Thompson,Blanche,2b6af6358ba1bc3a8641f374319e0d90 +Haag,Shannon,05372e8ae82a0d4c787738d1ac6772db +Watsica,Claudie,f47b376d7935c11be5f9ab7be80f5141 +Heaney,Nya,54bfdcd6cd4f17ec69b59481b58e675c +Weber,Mervin,6e346e7558dbe83c3af835b6ce83a0cc +Haley,Conner,19cbbdb21e61eb5550d1cc74801645a1 +O'Reilly,Imogene,e9690b84774251c14014f4aa390d48fe +Armstrong,Sylvan,459631981a96ff6cd490c78fb951e718 
+Roberts,Rosalinda,e8bf489c2f5fe285de626c024f156553 +Lindgren,Wellington,0348bbd0f013b2aed90106c431655c45 +Kunde,Hermann,ca59c4ee04587702bd8496bf6da9f8be +Dietrich,Chesley,ee36f44db1abfc8a14aa220c93f6053d +Emmerich,Eldred,9743ca1cdd40b5697f3f0f52b91c8ec7 +White,Devan,c0f62e869f6d4e26a61542bdeea9204c +Gleason,Russell,6e001b6dd1c043b172048114776a06a7 +Olson,Augusta,a2a7aca03a83df3144ea6ea553af0eef +Dach,Lewis,712cba7f187e5e5b8811519b9c7422f5 +Rutherford,Josh,de66c30e84e202292b5015d65b3a192a +Bahringer,Roel,4437b8896a7e0aa9b240544ccf0116a5 +Kunze,Xzavier,592783b0b3aa7bb7daece121c1d18d66 +Hermann,Tod,fd452a8ff12cbe2919925b4100625717 diff --git a/docs/examples/copy_file2file/data/input.csv b/docs/examples/copy_file2file/data/input.csv new file mode 100644 index 0000000000000000000000000000000000000000..690cfecb2d33ef118d50147f30487a24593513d8 --- /dev/null +++ b/docs/examples/copy_file2file/data/input.csv @@ -0,0 +1,101 @@ +"surname":"name":"md5" +"Corkery":"Jack":"a58bdfbcbcffac8dc6874d0b9b12332c" +"Purdy":"Moses":"f4b65103b4348e0edcf29e1cc89b80d4" +"Schmidt":"Kacey":"000534c2af80dcbfc86a3a6ce4114bd4" +"Kling":"Alyson":"fa70e25665a1dcbbffdf15210e7f7919" +"West":"Etha":"eb205d543e087790aef4f6c67dbd3c41" +"Runte":"Connie":"22de2c6a03a850c1114b7abac1b58e88" +"Dickens":"Fae":"49d67e6fbbe037d2eb872b00f948a243" +"Windler":"Cortez":"5b5fc22d532afbf5666b1aa2e06e24a0" +"Lueilwitz":"Stephanie":"278aba733fd2538d249af23f491e1408" +"Crooks":"Lyla":"34df319ae5078c623e946331efd39613" +"Abshire":"Golden":"e1d1c4aa6493f4e03a7c3da441607e2b" +"Mann":"Clay":"383dae9f9e84e22c08bf1876ec2d5f34" +"Volkman":"Dave":"d8f9bff13180516d5c855f81d1aee1a6" +"Nienow":"Sterling":"d769d497f2dddbda623541d2654e22c3" +"Lubowitz":"Vivien":"5954f5a39463a1537992e18c434dff6d" +"Raynor":"Opal":"87237b65aed44f8923a0f88a1cbf7293" +"Green":"Lamont":"929a7d0a4bd4d8a0cf3333f982df6913" +"Heaney":"Ruby":"46e54e2808b2703d3b7fef4f2b32ef35" +"Kshlerin":"Noemy":"05d7bccb695d34c32ca1d6c57248f452" 
+"Toy":"Maci":"24b0ebc4f2211b46584f88805f2d73b0" +"Stamm":"Sylvia":"63b8ec77a11b1fdef2ebad03390d0449" +"Runolfsson":"Jonathan":"9711e4233a5ec0829f18565518eec660" +"McCullough":"Caitlyn":"d5426f6bf7d631c7c6878cd291dbcd00" +"Stoltenberg":"Casimir":"8bcb701ccfbba3879755842dddd20713" +"Purdy":"Bo":"7b0c94073b15ef36a8f484b8a7018eeb" +"Mertz":"Dahlia":"e28a885e766a45d33cad2651e33c7c9c" +"Mraz":"Mac":"2b4b5ee8ef339da174400ae03964f596" +"Mante":"Estella":"dd10d779fb9f4b87e5c7a4f44cce3a3c" +"Buckridge":"Clay":"9c88404db46c4783ee2a7ff886acc2e1" +"Reichel":"Corene":"327f72f579a4bcaf0c7fe497f8547505" +"Weber":"Gregoria":"742df07ecd1093e0857d5198cec56116" +"Mosciski":"Magdalen":"2dea2861226eb7db026d5b9168958abc" +"Littel":"May":"f5b5965cd5f502bfd1069a676c2fb315" +"Schmeler":"Savion":"69683238030d211737dfe5ba68619081" +"Donnelly":"Delpha":"ddabbe379f9e7f8f70da8777415c39f8" +"Connelly":"Lucio":"9426741c677091d3f3d0938465121eea" +"Bergstrom":"Tyshawn":"4b04f7196b71ff9d71a8953c1fee33a1" +"Lind":"Jaren":"ee400c603fd1b8bcdac2d4c4f777b040" +"Klein":"Clementine":"79904e1168fa4949d080862b61da45ec" +"Fritsch":"Burnice":"93270a9e12d5ab4cf377c4aacf399ff4" +"Jenkins":"Tom":"6282403a66445bc9794cec6c6e97ddb4" +"Wuckert":"Gregorio":"42b9b3806dd77f1404cbab764fd7e1dc" +"Quitzon":"Clarissa":"2cf487a510eab609dd7b09583a080582" +"Hudson":"Donna":"0ee2034f98c05dfa6097ec557696e0a1" +"Gutmann":"Trent":"4d4f6ae782a04b6f46761230e7602215" +"Luettgen":"Darion":"b8a1e21cc37eb6fc756d4c6993a3a321" +"Block":"Eliane":"f6cd4c2a7532f01e111cac76e23534e8" +"Muller":"Rozella":"8d12d10037e26a135d879c567a176448" +"Herman":"Margie":"5a2efccfee605d2a479cee9fe58ec099" +"Rempel":"Christina":"8435faf7c096742ca19283c7a0bcf093" +"Bahringer":"Lia":"e4e04a5fd33bf1cbe4d81abd7d0b5303" +"Schmidt":"Bud":"fe29989f5b9bbd0c3eee17f298c3db47" +"Abernathy":"Murray":"a8b2f7e7325d498b0701c153320d52ac" +"Labadie":"Chance":"763279919c64c3c184d616dbbe636dbc" +"Kihn":"Craig":"984ab5d01a96e86b832d0816d5ae6b9c" 
+"Spinka":"Katherine":"f022834ace9100a2ea6e80ee15c3c078" +"Muller":"Loraine":"edc2fab54280754aa617ebcfd7623658" +"McDermott":"Krystina":"fc8727640a55a38ae0aec5a5d12405ea" +"Gusikowski":"Sally":"75ba9c800724ca29fb354b05e1b1821c" +"Ernser":"Glenda":"11835f321233f1057e95094b6b571183" +"Quigley":"Sabrina":"a4fa9f325ddffc7ba9754cd2cc43ad6e" +"Beahan":"Ila":"f9925e696406e07a66c6db00db4465ba" +"Crist":"Bennett":"baea14e62ac3738eebe57d9d6ffdf648" +"Muller":"Austin":"aab3f778e052623d5368a361b10a027a" +"Wintheiser":"Brooklyn":"9f4fd24c761d091aa2ec6b3d8cc964e1" +"McCullough":"Cecil":"1efdbe65338d162a129f335ac9cb7ca7" +"Rippin":"Genesis":"5102f849dd69094b1ce898e1c1d2caef" +"Braun":"Robb":"d58ed4a07e4d0f9c7f1e85346e375653" +"Buckridge":"Taylor":"298088519c3a144ac632ac0a4f96f5ed" +"Volkman":"Earline":"1a294690883cc30b31ef8c490cf96eb3" +"Satterfield":"Faustino":"685151fc6da4a0e42b622075783d189a" +"Ankunding":"Jody":"c50e4a093af50c2c3bdc5a021e42580a" +"Cassin":"London":"0031960bcb021f9802f4b977afb9de36" +"Krajcik":"Johnathon":"11d5a432eafdcf3d864a614b101a5362" +"Beahan":"Stevie":"af6c944f3beab59524c9a573a7ab8c07" +"McGlynn":"Aletha":"e40055946d8b214fe43f691292be5212" +"Windler":"Obie":"0613db2754bf924189f23d7c574bdef1" +"Batz":"Kody":"f4d9decf44266c5302f67ff6bdc63cb4" +"Yost":"Adella":"408b3f76b9b7dfc9d11ee406d9387e8e" +"Thompson":"Blanche":"2b6af6358ba1bc3a8641f374319e0d90" +"Haag":"Shannon":"05372e8ae82a0d4c787738d1ac6772db" +"Watsica":"Claudie":"f47b376d7935c11be5f9ab7be80f5141" +"Heaney":"Nya":"54bfdcd6cd4f17ec69b59481b58e675c" +"Weber":"Mervin":"6e346e7558dbe83c3af835b6ce83a0cc" +"Haley":"Conner":"19cbbdb21e61eb5550d1cc74801645a1" +"O'Reilly":"Imogene":"e9690b84774251c14014f4aa390d48fe" +"Armstrong":"Sylvan":"459631981a96ff6cd490c78fb951e718" +"Roberts":"Rosalinda":"e8bf489c2f5fe285de626c024f156553" +"Lindgren":"Wellington":"0348bbd0f013b2aed90106c431655c45" +"Kunde":"Hermann":"ca59c4ee04587702bd8496bf6da9f8be" +"Dietrich":"Chesley":"ee36f44db1abfc8a14aa220c93f6053d" 
+"Emmerich":"Eldred":"9743ca1cdd40b5697f3f0f52b91c8ec7" +"White":"Devan":"c0f62e869f6d4e26a61542bdeea9204c" +"Gleason":"Russell":"6e001b6dd1c043b172048114776a06a7" +"Olson":"Augusta":"a2a7aca03a83df3144ea6ea553af0eef" +"Dach":"Lewis":"712cba7f187e5e5b8811519b9c7422f5" +"Rutherford":"Josh":"de66c30e84e202292b5015d65b3a192a" +"Bahringer":"Roel":"4437b8896a7e0aa9b240544ccf0116a5" +"Kunze":"Xzavier":"592783b0b3aa7bb7daece121c1d18d66" +"Hermann":"Tod":"fd452a8ff12cbe2919925b4100625717" diff --git a/docs/examples/copy_sqlite2file/config.yml b/docs/examples/copy_sqlite2file/config.yml index bd1537a304c9337127b1e313dae3a89f48fd1f84..187183026b99479b8f98f39713767a16f496957e 100644 --- a/docs/examples/copy_sqlite2file/config.yml +++ b/docs/examples/copy_sqlite2file/config.yml @@ -17,3 +17,7 @@ jobs: connector: !file mode: w+ uri: 'file://./person.csv' + dialect: !dialect + delimiter: ',' + lineterminator: "\n" + quoting: 0 diff --git a/docs/source/index.rst b/docs/source/index.rst index c2845da8326b52b2293ee6f956025b5bab3504fb..7b9f9f11b388b52b4d0593ad626f65c67a5c6462 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,5 +20,6 @@ hshetl documentation extractors transformers loaders + utilities develop diagramms diff --git a/docs/source/utilities.rst b/docs/source/utilities.rst new file mode 100644 index 0000000000000000000000000000000000000000..f81e067649596b78a258af8645c1584502425be7 --- /dev/null +++ b/docs/source/utilities.rst @@ -0,0 +1,4 @@ +utilities +========= + +.. 
autoclass:: hshetl.Dialect \ No newline at end of file diff --git a/hshetl/__init__.py b/hshetl/__init__.py index 1da01490c1b2eb523d489bd96504fd832de39871..c37e00465eb7438057a614c44deb0ea1a5d743d5 100644 --- a/hshetl/__init__.py +++ b/hshetl/__init__.py @@ -9,6 +9,7 @@ import yaml import logging import functools import inspect +import csv from collections import OrderedDict from exc import NotMatchingYAMLTagException, NotMatchingArgumentsException, ConfigurationException, UnknownNameReferenceException, DuplicateNameException, NameResolverException @@ -243,6 +244,63 @@ class AbstractRepository(OrderedDict, object): return 'AbtractRepository: ' + super(AbstractRepository, self).__repr__() +@yamlify +class Dialect(csv.Dialect): + '''A Dialect class for csv dialects + + .. seealso:: + http://docs.python.org/2/library/csv.html#dialects-and-formatting-parameters + + :param str delimiter: + Default ',' + :param boolean doublequote: + Default True + :param str/False escapechar: + Default None + :param str lineterminator: + Default ``\\n`` + :param str quotechar: + Default '"' + :param boolean skipinitialspace: + Default False + :param int quoting: + Default 0 + + Example yaml configuration: + + .. code-block:: yaml + + loader: !csvloader + connector: outputFile + dialect: !dialect + delimiter: ',' + lineterminator: "\\n" + quoting: 0 + + .. 
note:: + Escape sequences must be given in double quotes + + ''' + + yaml_tag = u'!dialect' + + def __init__(self, + delimiter=',', + doublequote = True, + escapechar = None, + lineterminator = '\n', + quotechar = '"', + skipinitialspace = False, + quoting = 0): + self.delimiter = delimiter + self.doublequote = doublequote + self.escapechar = escapechar + self.lineterminator = str(lineterminator) + self.quotechar = quotechar + self.skipinitialspace = skipinitialspace + self.quoting = quoting + + import entities from entities import * import connectors diff --git a/hshetl/extractors.py b/hshetl/extractors.py index 43d5458bb82e3fd468298bbb4e52e64e989be112..bffc6e5fc0ad2246ee252789b5f8762b7272b5e6 100644 --- a/hshetl/extractors.py +++ b/hshetl/extractors.py @@ -23,8 +23,9 @@ An extractor inside of a YAML job definition with more parameters: ''' import logging import csv +from uuid import uuid4 as uuid import ldap -from hshetl import yamlify, NameResolver +from hshetl import yamlify, NameResolver, Dialect from faker import Factory from exc import ExtractorException, ConfigurationException from ldap.controls import SimplePagedResultsControl @@ -271,8 +272,8 @@ class CsvExtractor(AbstractExtractor): Construction: - :param str delimiter: - The delimiter used in the csv. + :param hshetl.Dialect dialect: + The CSV dialect that will be used for CSV style. :param dict ``**kwargs``: Accepts parameters from :class:`.AbstractExtractor`. @@ -282,22 +283,24 @@ class CsvExtractor(AbstractExtractor): !csvextractor connector: myfile - delimiter: ',' + dialect: !dialect + lineterminator: "\\n" + quoting: 0 ''' yaml_tag = u'!csvextractor' '''Use this tag inside your YAML configuration, to define this extractor.''' - def __init__(self, delimiter = ',', *args, **kwargs): + def __init__(self, dialect = Dialect(), *args, **kwargs): '''Initializes the CsvExtrator and cares about the given configuration. You can define the delimiter in the configuration. 
''' super(CsvExtractor, self).__init__(*args, **kwargs) - self.delimiter = delimiter - '''The delimiter used in your CSV data.''' + self.dialect = dialect + self._resolve_dialect() def can_execute(self, connector): '''Defines which connector can be handled by this extractor.''' @@ -312,7 +315,7 @@ class CsvExtractor(AbstractExtractor): result = [] try: with self.connector as connection: - reader = csv.DictReader(connection, delimiter = self.delimiter) + reader = csv.DictReader(connection, dialect=self.dialect_name) for row in reader: for key, value in row.iteritems(): row[key] = value.decode(self.connector.encoding) @@ -321,6 +324,14 @@ class CsvExtractor(AbstractExtractor): logging.warn('Can not open file {}, this may be ok, if you will write into this file.'.format(self.connector.path)) return result + def _resolve_dialect(self): + if isinstance(self.dialect, str): + self.dialect_name = self.dialect + else: + self.dialect_name = unicode(uuid()) + csv.register_dialect(self.dialect_name, self.dialect) + + @yamlify class FakerExtractor(AbstractExtractor): diff --git a/hshetl/loaders.py b/hshetl/loaders.py index 65e413b3ff94c55f440d86a0aec2da5a364a79c6..3e8cddf61d50662da2348f237582f17034c29342 100644 --- a/hshetl/loaders.py +++ b/hshetl/loaders.py @@ -25,9 +25,11 @@ A loader inside of a YAML job definition with more parameters: from __future__ import print_function import logging import codecs +import csv +from uuid import uuid4 as uuid from ldap import modlist from sqlalchemy import Table, MetaData -from hshetl import yamlify, NameResolver +from hshetl import yamlify, NameResolver, Dialect from connectors import AbstractConnector, connector_repository from hshetl.exc import LoaderException, ConfigurationException @@ -105,28 +107,31 @@ class CsvLoader(AbstractLoader): Construction: - :param str delimiter: - The delimiter that will be used in the csv. + :param hshetl.Dialect dialect: + The CSV dialect that will be used for CSV style. 
:param dict ``**kwargs``: Accepts parameters from :class:`.AbstractExtractor`. YAML definition sample: .. code-block:: yaml - + !csvloader connector: myfile - delimiter: ',' + dialect: !dialect + lineterminator: "\\n" + quoting: 0 ''' yaml_tag = u'!csvloader' '''Use this tag inside your YAML configuration, to define this loader.''' - def __init__(self, delimiter = ',', **kwargs): + def __init__(self, dialect = Dialect(), **kwargs): super(CsvLoader, self).__init__(**kwargs) - self.delimiter = delimiter - '''The delimiter used to separate columns in the CSV.''' + self.dialect = dialect + self._resolve_dialect() + def _update(self, data = []): '''This loader can not update records. Therefore only empty data is allowed''' @@ -137,28 +142,35 @@ class CsvLoader(AbstractLoader): '''Writes data in csv format''' if data == []: logging.info('Attempting to write nothing into the file: ' + self.connector.path); return else: logging.info('Attempting to write ' + repr(len(data)) + ' records into the : ' + self.connector.path) - header = self.delimiter.join(data[0].to_dict().iterkeys()) - logging.info('Writing header: ' + header) + header = data[0].to_dict().keys() + logging.info('Writing header: ' + str(header)) + with self.connector as connection: # if self.connector.encoding: connection.write(codecs.BOM_UTF8) - print(header.encode(self.connector.encoding), file = connection) + writer = csv.writer(connection, self.dialect_name) + writer.writerow(header) for record in data: - line = u'' + line = [] for prop in record.to_dict().itervalues(): if type(prop) == list: - line += '|'.join(map(lambda x: str(x), prop)) + line.append(u'|'.join(map(lambda x: str(x), prop))) else: - line += unicode(prop) - line += self.delimiter - line = line[:-1] - logging.debug('Writing line: ' + line) - print(line.encode(self.connector.encoding), file = connection) + line.append(unicode(prop)) + logging.debug('Writing line: ' + str(line)) + writer.writerow(line) def _delete(self, data = []): '''This 
loader can not delete records in the target. Therefore only empty data is allowed''' if data == []: return raise LoaderException('Deletion is not available in the CsvLoader') + def _resolve_dialect(self): + if isinstance(self.dialect, str): + self.dialect_name = self.dialect + else: + self.dialect_name = unicode(uuid()) + csv.register_dialect(self.dialect_name, self.dialect) + @yamlify class SqlAlchemyLoader(AbstractLoader): diff --git a/hshetl/test/functional/test_copy_faker2file.py b/hshetl/test/functional/test_copy_faker2file.py index 4cd87e8adcbf6a61f8cac42cabd7aec0583ce4de..0392669e5588c254c09ff9a173f2b3f9c808a955 100644 --- a/hshetl/test/functional/test_copy_faker2file.py +++ b/hshetl/test/functional/test_copy_faker2file.py @@ -15,4 +15,4 @@ class TestFakerCopyJob(TestBaseClass): def test_job(self): Controller().configure().execute() - self.assertTrue(filecmp.cmp(self.test_working_directory + 'data/expected_fakeperson.csv', self.test_working_directory + 'fakeperson.csv')) \ No newline at end of file + self.assertTrue(filecmp.cmp(self.test_working_directory + 'data/expected_fakeperson.csv', self.test_working_directory + 'fakeperson.csv')) diff --git a/hshetl/test/functional/test_copy_file2file.py b/hshetl/test/functional/test_copy_file2file.py new file mode 100644 index 0000000000000000000000000000000000000000..05e0a32a9b40384a4b85a0517df6c07034b69e55 --- /dev/null +++ b/hshetl/test/functional/test_copy_file2file.py @@ -0,0 +1,18 @@ +from hshetl.test.functional import TestBaseClass +from hshetl.cli import Controller +import sys +import filecmp + + +class TestCopyFileJob(TestBaseClass): + + example_dir_name = 'copy_file2file' + + def setUp(self): + super(TestCopyFileJob, self).setUp() + sys.argv = ['hshetl', '-d', self.test_working_directory] + + + def test_job(self): + Controller().configure().execute() + self.assertTrue(filecmp.cmp(self.test_working_directory + 'data/expected_output.csv', self.test_working_directory + 'output.csv')) diff --git a/hshetl/test/unit/test_extractors.py 
b/hshetl/test/unit/test_extractors.py index 55d061d7b3a6c4dbc4967d35dbc6eeef9743adb7..a0de53b112611677e98c35ca2e7cd594b47bcff3 100644 --- a/hshetl/test/unit/test_extractors.py +++ b/hshetl/test/unit/test_extractors.py @@ -1,7 +1,7 @@ import unittest import os from mock import Mock -from hshetl import extractors, connectors +from hshetl import extractors, connectors, Dialect from hshetl.test import unit as test from hshetl.exc import ExtractorException, ConfigurationException from sqlalchemy.engine import base as sqlalchemybase @@ -30,7 +30,7 @@ class TestCsvExtractor(unittest.TestCase): file_connector.connection = open(os.path.dirname(__file__) + '/../fixture/short_sample_semicolon.csv', 'r') file_connector.__enter__ = Mock(return_value = file_connector.connection) file_connector.__exit__ = Mock() - csv_extractor = extractors.CsvExtractor(delimiter = ';', connector = file_connector) + csv_extractor = extractors.CsvExtractor(dialect = Dialect(delimiter = ';'), connector = file_connector) result = csv_extractor.execute() self.assertEqual(result, test.fixtures['extractor_test_extractions_expected_result'])