From 0f20ddfa644fbcc7df56fabd25403366a0856a31 Mon Sep 17 00:00:00 2001 From: Fox-IT Security Research Team Date: Wed, 20 Jul 2022 11:29:44 +0200 Subject: [PATCH] The flow.record project --- .github/workflows/dissect-ci.yml | 7 + .gitignore | 11 + COPYRIGHT | 5 + LICENSE | 661 ++++++++++++++++++++ MANIFEST.in | 2 + README.md | 105 ++++ examples/filesystem.py | 108 ++++ examples/passivedns.py | 71 +++ examples/records.json | 2 + examples/tcpconn.py | 43 ++ flow/record/__init__.py | 79 +++ flow/record/adapter/__init__.py | 64 ++ flow/record/adapter/archive.py | 32 + flow/record/adapter/avro.py | 192 ++++++ flow/record/adapter/broker.py | 47 ++ flow/record/adapter/csvfile.py | 43 ++ flow/record/adapter/elastic.py | 43 ++ flow/record/adapter/jsonfile.py | 68 +++ flow/record/adapter/line.py | 37 ++ flow/record/adapter/mongo.py | 91 +++ flow/record/adapter/splunk.py | 82 +++ flow/record/adapter/stream.py | 51 ++ flow/record/adapter/text.py | 50 ++ flow/record/adapter/xlsx.py | 65 ++ flow/record/base.py | 807 +++++++++++++++++++++++++ flow/record/fieldtypes/__init__.py | 491 +++++++++++++++ flow/record/fieldtypes/credential.py | 9 + flow/record/fieldtypes/net/__init__.py | 15 + flow/record/fieldtypes/net/ip.py | 80 +++ flow/record/fieldtypes/net/ipv4.py | 137 +++++ flow/record/fieldtypes/net/tcp.py | 9 + flow/record/fieldtypes/net/udp.py | 9 + flow/record/jsonpacker.py | 101 ++++ flow/record/packer.py | 167 +++++ flow/record/selector.py | 714 ++++++++++++++++++++++ flow/record/stream.py | 293 +++++++++ flow/record/tools/__init__.py | 0 flow/record/tools/geoip.py | 194 ++++++ flow/record/tools/rdump.py | 169 ++++++ flow/record/utils.py | 87 +++ flow/record/whitelist.py | 40 ++ pyproject.toml | 9 + setup.cfg | 9 + setup.py | 26 + tests/__init__.py | 0 tests/selector_explain_example.py | 32 + tests/standalone_test.py | 16 + tests/test_compiled_selector.py | 37 ++ tests/test_fieldtype_ip.py | 238 ++++++++ tests/test_fieldtypes.py | 458 ++++++++++++++ tests/test_json_packer.py | 25 + tests/test_json_record_adapter.py | 71 +++ tests/test_packer.py | 216 +++++++ tests/test_rdump.py | 178 ++++++ tests/test_record.py | 613 +++++++++++++++++++ tests/test_record_adapter.py | 381 ++++++++++++ tests/test_record_descriptor.py | 142 +++++ tests/test_regression.py | 376 ++++++++++++ tests/test_selector.py | 504 +++++++++++++++ tests/test_splunk_adapter.py | 112 ++++ tests/utils_inspect.py | 58 ++ tox.ini | 58 ++ 62 files changed, 8840 insertions(+) create mode 100644 .github/workflows/dissect-ci.yml create mode 100644 .gitignore create mode 100644 COPYRIGHT create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 examples/filesystem.py create mode 100644 examples/passivedns.py create mode 100644 examples/records.json create mode 100644 examples/tcpconn.py create mode 100644 flow/record/__init__.py create mode 100644 flow/record/adapter/__init__.py create mode 100644 flow/record/adapter/archive.py create mode 100644 flow/record/adapter/avro.py create mode 100644 flow/record/adapter/broker.py create mode 100644 flow/record/adapter/csvfile.py create mode 100644 flow/record/adapter/elastic.py create mode 100644 flow/record/adapter/jsonfile.py create mode 100644 flow/record/adapter/line.py create mode 100644 flow/record/adapter/mongo.py create mode 100644 flow/record/adapter/splunk.py create mode 100644 flow/record/adapter/stream.py create mode 100644 flow/record/adapter/text.py create mode 100644 flow/record/adapter/xlsx.py create mode 100644 flow/record/base.py create 
mode 100644 flow/record/fieldtypes/__init__.py create mode 100644 flow/record/fieldtypes/credential.py create mode 100644 flow/record/fieldtypes/net/__init__.py create mode 100644 flow/record/fieldtypes/net/ip.py create mode 100644 flow/record/fieldtypes/net/ipv4.py create mode 100644 flow/record/fieldtypes/net/tcp.py create mode 100644 flow/record/fieldtypes/net/udp.py create mode 100644 flow/record/jsonpacker.py create mode 100644 flow/record/packer.py create mode 100644 flow/record/selector.py create mode 100644 flow/record/stream.py create mode 100644 flow/record/tools/__init__.py create mode 100644 flow/record/tools/geoip.py create mode 100644 flow/record/tools/rdump.py create mode 100644 flow/record/utils.py create mode 100644 flow/record/whitelist.py create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/selector_explain_example.py create mode 100644 tests/standalone_test.py create mode 100644 tests/test_compiled_selector.py create mode 100644 tests/test_fieldtype_ip.py create mode 100644 tests/test_fieldtypes.py create mode 100644 tests/test_json_packer.py create mode 100644 tests/test_json_record_adapter.py create mode 100644 tests/test_packer.py create mode 100644 tests/test_rdump.py create mode 100644 tests/test_record.py create mode 100644 tests/test_record_adapter.py create mode 100644 tests/test_record_descriptor.py create mode 100644 tests/test_regression.py create mode 100644 tests/test_selector.py create mode 100644 tests/test_splunk_adapter.py create mode 100644 tests/utils_inspect.py create mode 100644 tox.ini diff --git a/.github/workflows/dissect-ci.yml b/.github/workflows/dissect-ci.yml new file mode 100644 index 0000000..4602eeb --- /dev/null +++ b/.github/workflows/dissect-ci.yml @@ -0,0 +1,7 @@ +name: Dissect CI +on: [push, pull_request, workflow_dispatch] + +jobs: + ci: + uses: fox-it/dissect-workflow-templates/.github/workflows/dissect-ci-template-self-hosted.yml@main + secrets: inherit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a89302b --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +coverage.xml +.coverage +dist/ +.eggs/ +*.egg-info/ +*.pyc +__pycache__/ +.pytest_cache/ +.tox/ + +flow/record/version.py diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 0000000..c055a21 --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,5 @@ +Dissect is released as open source by Fox-IT (https://www.fox-it.com) part of NCC Group Plc (https://www.nccgroup.com) + +Developed by the Dissect Team (dissect@fox-it.com) and made available at https://github.com/fox-it/flow.record + +License terms: AGPL3 (https://www.gnu.org/licenses/agpl-3.0.html) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..be3f7b2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. 
In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..4b4dd26 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +exclude .gitignore +exclude .github diff --git a/README.md b/README.md new file mode 100644 index 0000000..ef882ba --- /dev/null +++ b/README.md @@ -0,0 +1,105 @@ +# flow.record + +A library for defining and creating structured data (called records) that can be streamed to disk or piped to other +tools that use `flow.record`. 
+
+Records can be read and transformed to other formats by using output adapters, such as CSV and JSON.
+
+For more information on how Dissect uses this library, please see [the
+documentation](https://dissect.readthedocs.io/en/latest/tools/rdump.html#what-is-a-record).
+
+## Usage
+
+This library contains the tool `rdump`. With `rdump` you can read, write, interact with, and manipulate records from `stdin`
+or from record files saved on disk. Please refer to `rdump -h` or to the [`rdump`
+documentation](https://dissect.readthedocs.io/en/latest/tools/rdump.html) for all parameters.
+
+Records are the primary output type when using the various functions of `target-query`. The following command shows how
+to pipe record output from `target-query` to `rdump`:
+
+```shell
+user@dissect~$ target-query -f runkeys targets/EXAMPLE.vmx | rdump
+
+<...>
+```
+
+## Programming example
+
+Define a `RecordDescriptor` (schema), then create a few records and write them to disk:
+
+```python
+from flow.record import RecordDescriptor, RecordWriter
+
+# define our descriptor
+MyRecord = RecordDescriptor("my/record", [
+    ("net.ipaddress", "ip"),
+    ("string", "description"),
+])
+
+# define some records
+records = [
+    MyRecord("1.1.1.1", "cloudflare dns"),
+    MyRecord("8.8.8.8", "google dns"),
+]
+
+# write the records to disk
+with RecordWriter("output.records.gz") as writer:
+    for record in records:
+        writer.write(record)
+```
+
+The records can then be read from disk using the `rdump` tool or by instantiating a `RecordReader` when using the
+library.
+
+```shell
+$ rdump output.records.gz
+
+
+```
+
+### Selectors
+
+We can also use `selectors` for filtering and selecting records using a query (Python-like syntax), e.g.:
+
+```shell
+$ rdump output.records.gz -s '"google" in r.description'
+
+$ rdump output.records.gz -s 'r.ip in net.ipnetwork("1.1.0.0/16")'
+```
+
+## Build and test instructions
+
+This project uses `tox` to build source and wheel distributions. Run the following command from the root folder to build
+these:
+
+```bash
+tox -e build
+```
+
+The build artifacts can be found in the `dist/` directory.
+
+`tox` is also used to run linting and unit tests in a self-contained environment. To run both linting and unit tests
+using the default installed Python version, run:
+
+```bash
+tox
+```
+
+For a more elaborate explanation on how to build and test the project, please see [the
+documentation](https://dissect.readthedocs.io/en/latest/contributing/developing.html#building-testing).
+
+## Contributing
+
+The Dissect project encourages any contribution to the codebase. To make your contribution fit into the project, please
+refer to [the style guide](https://dissect.readthedocs.io/en/latest/contributing/style-guide.html).
+
+## Copyright and license
+
+Dissect is released as open source by Fox-IT (<https://www.fox-it.com>) part of NCC Group Plc
+(<https://www.nccgroup.com>).
+
+Developed by the Dissect Team (<dissect@fox-it.com>) and made available at <https://github.com/fox-it/flow.record>.
+
+License terms: AGPL3 (<https://www.gnu.org/licenses/agpl-3.0.html>). For more information, see the LICENSE file.
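The README mentions reading records back by instantiating a `RecordReader` but does not show it. A minimal reading sketch: the file name and field names are taken from the writing example above, and the context-manager and iteration behaviour is assumed from the `AbstractReader` base class added later in this patch.

```python
from flow.record import RecordReader

# read the records written by the example above and print two fields per record
with RecordReader("output.records.gz") as reader:
    for record in reader:
        print(record.ip, record.description)
```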
diff --git a/examples/filesystem.py b/examples/filesystem.py new file mode 100644 index 0000000..a8f5524 --- /dev/null +++ b/examples/filesystem.py @@ -0,0 +1,108 @@ +import os +import stat + +from datetime import datetime + +from flow.record import RecordDescriptor, RecordWriter + +FilesystemFile = RecordDescriptor(""" +filesystem/unix/entry + string path; + varint inode; + varint dev; + unix_file_mode mode; + filesize size; + uint32 uid; + uint32 gid; + datetime ctime; + datetime mtime; + datetime atime; + string link; +""") + + +def hash_file(path, t): + f = open(path, "rb") + while 1: + d = f.read(4096) + if d == "": + break + f.close() + + +class FilesystemIterator: + basepath = None + + def __init__(self, basepath): + self.basepath = basepath + self.recordType = FilesystemFile + + def classify(self, source, classification): + self.recordType = FilesystemFile.base(_source=source, _classification=classification) + + def iter(self, path): + path = os.path.abspath(path) + return self._iter(path) + + def _iter(self, path): + if path.startswith("/proc"): + return + + st = os.lstat(path) + + abspath = path + if self.basepath and abspath.startswith(self.basepath): + abspath = abspath[len(self.basepath):] + + ifmt = stat.S_IFMT(st.st_mode) + + link = None + if ifmt == stat.S_IFLNK: + link = os.readlink(path) + + yield self.recordType( + path=abspath, + inode=int(st.st_ino), + dev=int(st.st_dev), + mode=st.st_mode, + size=st.st_size, + uid=st.st_uid, + gid=st.st_gid, + ctime=datetime.fromtimestamp(st.st_ctime), + mtime=datetime.fromtimestamp(st.st_mtime), + atime=datetime.fromtimestamp(st.st_atime), + link=link, + ) + + if ifmt == stat.S_IFDIR: + for i in os.listdir(path): + if i in (".", ".."): + continue + + fullpath = os.path.join(path, i) + for e in self.iter(fullpath): + yield e + +chunk = [] + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('target', metavar="TARGET", nargs="*") + parser.add_argument('-s', dest='source', help="Source") + parser.add_argument('-c', dest='classification', help="Classification") + parser.add_argument('-b', dest='base', help="Base directory") + + args = parser.parse_args() + + stream = RecordWriter() + + fsiter = FilesystemIterator(args.base) + + if args.source or args.classification: + fsiter.classify(args.source, args.classification) + + for path in args.target: + for r in fsiter.iter(path): + stream.write(r) diff --git a/examples/passivedns.py b/examples/passivedns.py new file mode 100644 index 0000000..be05359 --- /dev/null +++ b/examples/passivedns.py @@ -0,0 +1,71 @@ +#!/usr/bin/env pypy +import record +import sys +import datetime + +import net.ipv4 + +from fileprocessing import DirectoryProcessor + + +def ts(s): + return datetime.datetime.fromtimestamp(float(s)) + + +def ip(s): + return net.ipv4.Address(s) + + +class SeparatedFile: + fp = None + seperator = None + format = None + + def __init__(self, fp, seperator, format): + self.fp = fp + self.seperator = seperator + self.format = format + + def __iter__(self): + desc = record.RecordDescriptor([i[0] for i in PASSIVEDNS_FORMAT]) + recordtype = desc.recordType + + for l in self.fp: + p = l.strip().split(self.seperator) + + r = {} + for i in range(len(self.format)): + field = self.format[i] + + v = p[i] + if field[1]: + v = field[1](v) + + r[field[0]] = v + + yield recordtype(**r) + + +def PassiveDnsFile(fp): + return SeparatedFile(fp, "||", PASSIVEDNS_FORMAT) + +PASSIVEDNS_FORMAT = [ + ("ts", ts), + ("src", ip), + ("dst", ip), + ("family", 
None), + ("query", None), + ("query_type", None), + ("result", None), + ("ttl", int), + ("x", None), +] + + +def main(): + rs = record.RecordOutput(sys.stdout) + for r in DirectoryProcessor(sys.argv[1], PassiveDnsFile, r"\.log\.gz"): + rs.write(r) + +if __name__ == "__main__": + main() diff --git a/examples/records.json b/examples/records.json new file mode 100644 index 0000000..30a415a --- /dev/null +++ b/examples/records.json @@ -0,0 +1,2 @@ +{"_type": "recorddescriptor", "_data": ["text/paste", [["string", "key"], ["datetime", "date"], ["datetime", "expire_date"], ["wstring", "title"], ["wstring", "content"], ["wstring", "user"], ["wstring", "syntax"]]]} +{"_classification": "PUBLIC", "_generated": "2019-03-19T09:11:04.706581", "_source": "external/pastebin", "_type": "record", "_recorddescriptor": ["text/paste", 831446724], "_version": 1, "content": "This is the content of a sampe pastebin record", "date": "2019-03-19T09:09:47", "expire_date": "1970-01-01T00:00:00", "key": "Q42eWSaF", "syntax": "text", "title": "A sample pastebin record", "user": ""} diff --git a/examples/tcpconn.py b/examples/tcpconn.py new file mode 100644 index 0000000..46fa7c4 --- /dev/null +++ b/examples/tcpconn.py @@ -0,0 +1,43 @@ +import random + +from datetime import datetime +from flow import record + +conn = record.RecordDescriptor(""" +network/traffic/tcp/connection + datetime ts; + net.ipv4.Address src; + net.tcp.Port srcport; + net.ipv4.Address dst; + net.tcp.Port dstport; +""") + +ip_list = [ + "127.0.0.1", + "1.2.3.4", + "212.33.1.45", + "4.4.4.4", + "8.8.8.8", + "212.1.6.1", +] + +port_list = [ + 22, + 53, + 80, + 443, + 5555 +] + +rs = record.RecordWriter() + +for i in range(500): + r = conn( + ts=datetime.now(), + src=random.choice(ip_list), + srcport=random.choice(port_list), + dst=random.choice(ip_list), + dstport=random.choice(port_list) + ) + + rs.write(r) diff --git a/flow/record/__init__.py b/flow/record/__init__.py new file mode 100644 index 0000000..1d29015 --- /dev/null +++ b/flow/record/__init__.py @@ -0,0 +1,79 @@ +import os + +import gzip + +from flow.record.base import ( + RECORD_VERSION, + FieldType, + Record, + GroupedRecord, + RecordDescriptor, + RecordAdapter, + RecordField, + RecordReader, + RecordWriter, + open_path, + stream, + extend_record, + dynamic_fieldtype, + DynamicDescriptor, + RecordDescriptorError, +) +from flow.record.jsonpacker import JsonRecordPacker +from flow.record.stream import ( + RecordOutput, + RecordPrinter, + RecordPacker, + RecordStreamWriter, + RecordStreamReader, + PathTemplateWriter, + RecordArchiver, + record_stream, +) + +__all__ = [ + 'RECORD_VERSION', 'FieldType', 'Record', 'GroupedRecord', + 'RecordDescriptor', 'RecordAdapter', 'RecordField', 'RecordReader', + 'RecordWriter', 'RecordOutput', 'RecordPrinter', 'RecordPacker', + 'JsonRecordPacker', 'RecordStreamWriter', 'RecordStreamReader', + 'open_path', 'stream', 'dynamic_fieldtype', 'DynamicDescriptor', + 'PathTemplateWriter', 'RecordArchiver', 'RecordDescriptorError', + 'record_stream', 'extend_record', +] + + +class View: + fields = None + + def __init__(self, fields): + self.fields = fields + + def __iter__(self, fields): + pass + + +class RecordDateSplitter: + basepath = None + out = None + + def __init__(self, basepath): + self.basepath = basepath + self.out = {} + + def getstream(self, t): + if t not in self.out: + path = os.path.join(self.basepath, "-".join(["{:2d}".format(v) for v in t]) + ".rec.gz") + f = gzip.GzipFile(path, "wb") + rs = RecordStreamWriter(f) + self.out[t] = rs + return 
self.out[t] + + def write(self, r): + t = (r.ts.year, r.ts.month, r.ts.day) + rs = self.getstream(t) + rs.write(r) + rs.fp.flush() + + def close(self): + for rs in self.out.values(): + rs.close() diff --git a/flow/record/adapter/__init__.py b/flow/record/adapter/__init__.py new file mode 100644 index 0000000..f244376 --- /dev/null +++ b/flow/record/adapter/__init__.py @@ -0,0 +1,64 @@ +__path__ = __import__('pkgutil').extend_path(__path__, __name__) # make this namespace extensible from other packages +import abc + + +def with_metaclass(meta, *bases): + """Create a base class with a metaclass. Python 2 and 3 compatible.""" + # This requires a bit of explanation: the basic idea is to make a dummy + # metaclass for one level of class instantiation that replaces itself with + # the actual metaclass. + class metaclass(type): + + def __new__(cls, name, this_bases, d): + return meta(name, bases, d) + + @classmethod + def __prepare__(cls, name, this_bases): + return meta.__prepare__(name, bases) + return type.__new__(metaclass, 'temporary_class', (), {}) + + +class AbstractWriter(with_metaclass(abc.ABCMeta, object)): + + @abc.abstractmethod + def write(self, rec): + """Write a record.""" + raise NotImplementedError + + @abc.abstractmethod + def flush(self): + """Flush any buffered writes.""" + raise NotImplementedError + + @abc.abstractmethod + def close(self): + """Close the Writer, no more writes will be possible.""" + raise NotImplementedError + + def __del__(self): + self.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.flush() + self.close() + + +class AbstractReader(with_metaclass(abc.ABCMeta, object)): + + @abc.abstractmethod + def __iter__(self): + """Return a record iterator.""" + raise NotImplementedError + + def close(self): + """Close the Reader, can be overriden to properly free resources.""" + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() diff --git a/flow/record/adapter/archive.py b/flow/record/adapter/archive.py new file mode 100644 index 0000000..0086282 --- /dev/null +++ b/flow/record/adapter/archive.py @@ -0,0 +1,32 @@ +from flow.record.adapter import AbstractReader, AbstractWriter +from flow.record.stream import RecordArchiver + + +class ArchiveWriter(AbstractWriter): + writer = None + + def __init__(self, path, **kwargs): + self.path = path + + path_template = kwargs.get("path_template") + name = kwargs.get("name") + + self.writer = RecordArchiver(self.path, path_template=path_template, name=name) + + def write(self, r): + self.writer.write(r) + + def flush(self): + # RecordArchiver already flushes after every write + pass + + def close(self): + if self.writer: + self.writer.close() + self.writer = None + + +class ArchiveReader(AbstractReader): + + def __init__(self, path, **kwargs): + raise NotImplementedError diff --git a/flow/record/adapter/avro.py b/flow/record/adapter/avro.py new file mode 100644 index 0000000..c870d72 --- /dev/null +++ b/flow/record/adapter/avro.py @@ -0,0 +1,192 @@ +import json +from importlib.util import find_spec +from datetime import datetime, timedelta, timezone + +import fastavro + +from flow import record +from flow.record.utils import is_stdout +from flow.record.selector import make_selector +from flow.record.adapter import AbstractReader, AbstractWriter + + +AVRO_TYPE_MAP = { + "boolean": "boolean", + "datetime": "long", + "filesize": "long", + "uint16": "int", + "uint32": "int", + "float": "float", + "string": "string", + "unix_file_mode": "long", + "varint": 
"long", + "wstring": "string", + "uri": "string", + "digest": "bytes", + "bytes": "bytes", +} + +RECORD_TYPE_MAP = { + "boolean": "boolean", + "int": "varint", + "long": "varint", + "float": "float", + "string": "string", + "bytes": "bytes", +} + +EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc) + + +class AvroWriter(AbstractWriter): + fp = None + writer = None + + def __init__(self, path, key=None, **kwargs): + self.fp = record.open_path(path, "wb") + + self.desc = None + self.schema = None + self.parsed_schema = None + self.writer = None + self.codec = 'snappy' if find_spec('snappy') else 'deflate' + + def write(self, r): + if not self.desc: + self.desc = r._desc + self.schema = descriptor_to_schema(self.desc) + self.parsed_schema = fastavro.parse_schema(self.schema) + self.writer = fastavro.write.Writer(self.fp, self.parsed_schema, codec=self.codec) + + if self.desc != r._desc: + raise Exception("Mixed record types") + + self.writer.write(r._packdict()) + + def flush(self): + if self.writer: + self.writer.flush() + + def close(self): + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None + self.writer = None + + +class AvroReader(AbstractReader): + fp = None + + def __init__(self, path, selector=None, **kwargs): + self.fp = record.open_path(path, "rb") + self.selector = make_selector(selector) + + self.reader = fastavro.reader(self.fp) + self.schema = self.reader.schema + if not self.schema: + raise Exception("Missing Avro schema") + + self.desc = schema_to_descriptor(self.schema) + + # Store the fieldnames that are of type "datetime" + self.datetime_fields = set( + name + for name, field in self.desc.get_all_fields().items() + if field.typename == "datetime" + ) + + def __iter__(self): + for obj in self.reader: + # Convert timestamp-micros fields back to datetime fields + for field_name in self.datetime_fields: + value = obj.get(field_name, None) + if isinstance(value, (int, float)) and value > 0xffffffff: + obj[field_name] = EPOCH + timedelta(microseconds=value) + + rec = self.desc.recordType(**obj) + if not self.selector or self.selector.match(rec): + yield rec + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + + +def descriptor_to_schema(desc): + namespace, _, name = desc.name.rpartition("/") + schema = { + "type": "record", + "namespace": namespace, + "name": name, + "doc": json.dumps(desc._pack()), + "fields": [], + } + + fields = [] + for rf in desc.get_all_fields().values(): + field_name = rf.name + field_type = rf.typename + field_schema = { + "name": field_name, + } + + if field_type == "datetime": + field_schema["type"] = [{"type": "long", "logicalType": "timestamp-micros"}, {"type": "null"}] + else: + avro_type = AVRO_TYPE_MAP.get(field_type) + if not avro_type: + raise Exception("Unsupported Avro type: {}".format(field_type)) + + field_schema["type"] = [avro_type, "null"] + + fields.append(field_schema) + + schema["fields"] = fields + return schema + + +def schema_to_descriptor(schema): + doc = schema.get("doc") + + # Sketchy record descriptor detection + if doc and doc.startswith("[\"") and doc.endswith("]]]"): + name, fields = json.loads(doc) + else: + # No embedded record descriptor, attempt to generate one from the schema + name = "/".join([schema.get("namespace", ""), schema.get("name", "")]).replace(".", "/").strip("/") + fields = [] + + for f in schema.get("fields", []): + field_name = f["name"] + if field_name.startswith("_"): + continue + + field_type = avro_type_to_flow_type(f["type"]) + fields.append([field_type, 
field_name]) + + return record.RecordDescriptor(name, fields) + + +def avro_type_to_flow_type(ftype): + ftypes = [ftype] if not isinstance(ftype, list) else ftype + + # If a field can be null, it has an additional type of "null" + # So iterate over all the types, and break when we have a valid one + for t in ftypes: + if isinstance(t, dict): + if t.get("type") == "array": + item_type = avro_type_to_flow_type(t.get("items")) + return "{}[]".format(item_type) + else: + logical_type = t.get("logicalType") + if logical_type and "time" in logical_type or "date" in logical_type: + return "datetime" + + if t == "null": + continue + + if t in RECORD_TYPE_MAP: + return RECORD_TYPE_MAP[t] + + raise TypeError("Can't map avro type to flow type: {}".format(t)) diff --git a/flow/record/adapter/broker.py b/flow/record/adapter/broker.py new file mode 100644 index 0000000..6a2dfaf --- /dev/null +++ b/flow/record/adapter/broker.py @@ -0,0 +1,47 @@ +from flow.record.adapter import AbstractWriter, AbstractReader +from flow.broker import Publisher, Subscriber + + +class BrokerWriter(AbstractWriter): + publisher = None + + def __init__(self, uri, source=None, classification=None, **kwargs): + self.publisher = Publisher(uri, **kwargs) + self.source = source + self.classification = classification + + def write(self, r): + record = r._replace( + _source=self.source or r._source, + _classification=self.classification or r._classification, + ) + self.publisher.send(record) + + def flush(self): + if self.publisher: + self.publisher.flush() + + def close(self): + if self.publisher: + if hasattr(self.publisher, "stop"): + # Requires flow.broker >= 1.1.1 + self.publisher.stop() + else: + self.publisher.wait() + self.publisher = None + + +class BrokerReader(AbstractReader): + subscriber = None + + def __init__(self, uri, name=None, selector=None, **kwargs): + self.subscriber = Subscriber(uri, **kwargs) + self.subscription = self.subscriber.select(name, str(selector)) + + def __iter__(self): + return iter(self.subscription) + + def close(self): + if self.subscriber: + self.subscriber.stop() + self.subscriber = None diff --git a/flow/record/adapter/csvfile.py b/flow/record/adapter/csvfile.py new file mode 100644 index 0000000..cbb6622 --- /dev/null +++ b/flow/record/adapter/csvfile.py @@ -0,0 +1,43 @@ +from __future__ import absolute_import + +import sys +from csv import DictWriter + +from flow.record import open_path +from flow.record.utils import is_stdout +from flow.record.adapter import AbstractWriter + + +class CsvfileWriter(AbstractWriter): + fp = None + + def __init__(self, path, fields=None, exclude=None, **kwargs): + mode = "w" + if sys.version_info[0] < 3: + mode = "wb" + self.fp = open_path(path, mode) + self.desc = None + self.writer = None + self.fields = fields + self.exclude = exclude + if isinstance(self.fields, str): + self.fields = self.fields.split(",") + if isinstance(self.exclude, str): + self.exclude = self.exclude.split(",") + + def write(self, r): + rdict = r._asdict(fields=self.fields, exclude=self.exclude) + if not self.desc or self.desc != r._desc: + self.desc = r._desc + self.writer = DictWriter(self.fp, rdict) + self.writer.writeheader() + self.writer.writerow(rdict) + + def flush(self): + if self.fp: + self.fp.flush() + + def close(self): + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py new file mode 100644 index 0000000..38c1b1c --- /dev/null +++ b/flow/record/adapter/elastic.py @@ -0,0 
+1,43 @@ +import elasticsearch +import elasticsearch.helpers + +from flow.record.adapter import AbstractWriter, AbstractReader + + +def index_stream(index, it): + for r in it: + d = r.dict() + if "Value" in d: + del d["Value"] + + yield { + "_index": index, + "_type": "event_" + str(d["EventID"]), + "_source": d, + } + + +class ElasticWriter(AbstractWriter): + + def __init__(self, index, **kwargs): + self.index = index + + self.es = elasticsearch.Elasticsearch() + + # def writeblob(self, src): + # count = elasticsearch.helpers.bulk(es, index_stream("logtest", src)) + + def write(self, r): + self.es.index({"_index": self.index, "_type": r._desc.name, "_source": r.dict()}) + + def flush(self): + pass + + def close(self): + pass + + +class ElasticReader(AbstractReader): + + def __iter__(self, r, **kwargs): + raise NotImplementedError() diff --git a/flow/record/adapter/jsonfile.py b/flow/record/adapter/jsonfile.py new file mode 100644 index 0000000..16ab985 --- /dev/null +++ b/flow/record/adapter/jsonfile.py @@ -0,0 +1,68 @@ +import json +from flow import record +from flow.record import JsonRecordPacker +from flow.record.utils import is_stdout +from flow.record.selector import make_selector +from flow.record.adapter import AbstractWriter, AbstractReader +from flow.record.fieldtypes import fieldtype_for_value + + +class JsonfileWriter(AbstractWriter): + fp = None + + def __init__(self, path, indent=None, **kwargs): + self.fp = record.open_path(path, "w") + if isinstance(indent, str): + indent = int(indent) + self.packer = JsonRecordPacker(indent=indent) + self.packer.on_descriptor.add_handler(self.packer_on_new_descriptor) + + def packer_on_new_descriptor(self, descriptor): + self._write(descriptor) + + def _write(self, obj): + record_json = self.packer.pack(obj) + self.fp.write(record_json + u"\n") + + def write(self, r): + self._write(r) + + def flush(self): + if self.fp: + self.fp.flush() + + def close(self): + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None + + +class JsonfileReader(AbstractReader): + fp = None + + def __init__(self, path, selector=None, **kwargs): + self.selector = make_selector(selector) + self.fp = record.open_path(path, "r") + self.packer = JsonRecordPacker() + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + + def __iter__(self): + for line in self.fp: + obj = self.packer.unpack(line) + if isinstance(obj, record.Record): + if not self.selector or self.selector.match(obj): + yield obj + elif isinstance(obj, record.RecordDescriptor): + pass + else: + # fallback for plain jsonlines (non flow.record format) + jd = json.loads(line) + fields = [(fieldtype_for_value(val, "string"), key) for key, val in jd.items()] + desc = record.RecordDescriptor("json/record", fields) + obj = desc(**jd) + if not self.selector or self.selector.match(obj): + yield obj diff --git a/flow/record/adapter/line.py b/flow/record/adapter/line.py new file mode 100644 index 0000000..b38f906 --- /dev/null +++ b/flow/record/adapter/line.py @@ -0,0 +1,37 @@ +from flow.record.adapter import AbstractWriter +from flow.record import open_path +from flow.record.utils import is_stdout + + +class LineWriter(AbstractWriter): + """Prints all fields and values of the Record on a separate line.""" + + fp = None + + def __init__(self, path, fields=None, exclude=None, **kwargs): + self.fp = open_path(path, "wb") + self.count = 0 + self.fields = fields + self.exclude = exclude + if isinstance(self.fields, str): + self.fields = self.fields.split(",") + if 
isinstance(self.exclude, str): + self.exclude = self.exclude.split(",") + + def write(self, rec): + rdict = rec._asdict(fields=self.fields, exclude=self.exclude) + self.count += 1 + self.fp.write("--[ RECORD {} ]--\n".format(self.count).encode()) + if rdict: + fmt = "{{:>{width}}} = {{}}\n".format(width=max(len(k) for k in rdict)) + for (key, value) in rdict.items(): + self.fp.write(fmt.format(key, value).encode()) + + def flush(self): + if self.fp: + self.fp.flush() + + def close(self): + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None diff --git a/flow/record/adapter/mongo.py b/flow/record/adapter/mongo.py new file mode 100644 index 0000000..69c34c5 --- /dev/null +++ b/flow/record/adapter/mongo.py @@ -0,0 +1,91 @@ +import bson +from flow import record +from flow.record.adapter import AbstractReader, AbstractWriter +from flow.record.selector import make_selector +from pymongo import MongoClient + + +def parse_path(path): + elements = path.strip("/").split("/", 2) # max 3 elements + if len(elements) == 2: + return "localhost", elements[0], elements[1] + if len(elements) == 3: + return tuple(elements) + raise ValueError("Invalid mongo path") + + +class MongoWriter(AbstractWriter): + client = None + + def __init__(self, path, key=None, **kwargs): + dbhost, dbname, collection = parse_path(path) + + self.key = key + self.client = MongoClient(host=dbhost) + self.db = self.client[dbname] + self.collection = self.db[collection] + self.coll_descriptors = self.db["_descriptors"] + self.descriptors = {} + + def write(self, r): + d = r._packdict() + d["_type"] = r._desc.identifier + + if r._desc.identifier not in self.descriptors: + self.coll_descriptors.find_and_modify( + {"name": r._desc.identifier}, + {"name": r._desc.identifier, "descriptor": r._desc._pack()}, + upsert=True) + + if self.key: + # i = self.collection.replace({self.key: d[self.key]}, d) # PyMongo3 + self.collection.find_and_modify({self.key: d[self.key]}, d, upsert=True) # PyMongo2 + else: + self.collection.insert(d) + + def flush(self): + pass + + def close(self): + if self.client: + self.client.close() + self.client = None + + +class MongoReader(AbstractReader): + client = None + + def __init__(self, path, selector=None, **kwargs): + dbhost, dbname, collection = parse_path(path) + + self.selector = make_selector(selector) + self.client = MongoClient(host=dbhost) + self.db = self.client[dbname] + self.collection = self.db[collection] + self.coll_descriptors = self.db["_descriptors"] + self.descriptors = {} + + def close(self): + if self.client: + self.client.close() + self.client = None + + def __iter__(self): + desc = None + for r in self.collection.find(): + if r["_type"] not in self.descriptors: + packed_desc = self.coll_descriptors.find({"name": r["_type"]})[0]["descriptor"] + self.descriptors[r["_type"]] = record.RecordDescriptor(*packed_desc) + + desc = self.descriptors[r["_type"]] + + del r["_id"] + del r["_type"] + + for k in list(r.keys()): + if isinstance(r[k], bson.int64.Int64): + r[k] = int(r[k]) + + obj = desc(**r) + if not self.selector or self.selector.match(obj): + yield obj diff --git a/flow/record/adapter/splunk.py b/flow/record/adapter/splunk.py new file mode 100644 index 0000000..8d6c0de --- /dev/null +++ b/flow/record/adapter/splunk.py @@ -0,0 +1,82 @@ +import socket +import logging + +from flow.record.adapter import AbstractReader, AbstractWriter +from flow.record.utils import to_str, to_bytes, to_base64 + + +log = logging.getLogger(__package__) + +RESERVED_SPLUNK_FIELDS = set([ + 
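+    # Field names already used by Splunk itself; colliding record fields are
+    # emitted with an "rd_" prefix by splunkify() below.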
'_indextime', + '_time', + 'index', + 'punct', + 'source', + 'sourcetype', + 'tag', +]) + + +def splunkify(record, tag=None): + ret = [] + + ret.append(f'type="{record._desc.name}"') + + if tag is None: + ret.append('rdtag=None') + else: + ret.append(f'rdtag="{tag}"') + + for field in record._desc.fields: + val = getattr(record, field) + if val is None: + ret.append(f'{field}=None') + else: + val = to_base64(val) if isinstance(val, bytes) else to_str(val) + val = val.replace('\\', '\\\\').replace('"', '\\"') + if field in RESERVED_SPLUNK_FIELDS: + field = f'rd_{field}' + ret.append(f'{field}="{val}"') + + return " ".join(ret) + + +class SplunkWriter(AbstractWriter): + sock = None + + def __init__(self, path, tag=None, **kwargs): + p = path.strip("/").split("/") + host, port = p[0].split(":") + port = int(port) + + self.tag = tag + self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.SOL_TCP) + self.sock.connect((host, port)) + self.descriptors = {} + self._warned = False + + def write(self, record): + if not self._warned and 'rdtag' in record._desc.fields: + self._warned = True + log.warning( + "Record has 'rdtag' field which conflicts with the Splunk adapter -- " + "Splunk output will have duplicate 'rdtag' fields", + ) + rec = splunkify(record, tag=self.tag) + data = to_bytes(rec) + b"\n" + self.sock.sendall(data) + + def flush(self): + pass + + def close(self): + if self.sock: + self.sock.close() + self.sock = None + + +class SplunkReader(AbstractReader): + + def __init__(self, path, selector=None, **kwargs): + raise NotImplementedError() diff --git a/flow/record/adapter/stream.py b/flow/record/adapter/stream.py new file mode 100644 index 0000000..c07ba4b --- /dev/null +++ b/flow/record/adapter/stream.py @@ -0,0 +1,51 @@ +from flow import record +from flow.record.utils import is_stdout +from flow.record.adapter import AbstractReader, AbstractWriter + + +class StreamWriter(AbstractWriter): + fp = None + stream = None + + def __init__(self, path, clobber=True, **kwargs): + self.fp = record.open_path(path, "wb", clobber=clobber) + self.stream = record.RecordOutput(self.fp) + + def write(self, r): + self.stream.write(r) + + def flush(self): + if self.stream and hasattr(self.stream, "flush"): + self.stream.flush() + if self.fp: + self.fp.flush() + + def close(self): + if self.stream: + self.stream.close() + self.stream = None + + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None + + +class StreamReader(AbstractReader): + fp = None + stream = None + + def __init__(self, path, selector=None, **kwargs): + self.fp = record.open_path(path, "rb") + self.stream = record.RecordStreamReader(self.fp, selector=selector) + + def __iter__(self): + return iter(self.stream) + + def close(self): + if self.stream: + self.stream.close() + self.stream = None + + if self.fp: + self.fp.close() + self.fp = None diff --git a/flow/record/adapter/text.py b/flow/record/adapter/text.py new file mode 100644 index 0000000..1e8ce06 --- /dev/null +++ b/flow/record/adapter/text.py @@ -0,0 +1,50 @@ +from flow.record import open_path +from flow.record.utils import is_stdout +from flow.record.adapter import AbstractWriter + +REPLACE_LIST = [ + (r"\r", "\r"), + (r"\n", "\n"), + (r"\t", "\t"), +] + + +class DefaultMissing(dict): + def __missing__(self, key): + return key.join("{}") + + +class TextWriter(AbstractWriter): + """Records are printed as textual representation with repr() or using `format_spec`.""" + + fp = None + + def __init__(self, path, flush=True, format_spec=None, 
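+                 # format_spec: optional str.format() template applied to each record;
+                 # literal "\r", "\n" and "\t" sequences in it are unescaped via REPLACE_LIST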
**kwargs): + self.fp = open_path(path, "wb") + self.auto_flush = flush + self.format_spec = format_spec + + # Allow some special characters in format template + if self.format_spec: + for old, new in REPLACE_LIST: + self.format_spec = self.format_spec.replace(old, new) + + def write(self, rec): + if self.format_spec: + buf = self.format_spec.format_map(DefaultMissing(rec._asdict())) + else: + buf = repr(rec) + self.fp.write(buf.encode() + b"\n") + + # because stdout is usually line buffered we force flush here if wanted + if self.auto_flush: + self.flush() + + def flush(self): + if self.fp: + self.fp.flush() + + def close(self): + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None diff --git a/flow/record/adapter/xlsx.py b/flow/record/adapter/xlsx.py new file mode 100644 index 0000000..311af1d --- /dev/null +++ b/flow/record/adapter/xlsx.py @@ -0,0 +1,65 @@ +import openpyxl + +from flow import record +from flow.record.utils import is_stdout +from flow.record.selector import make_selector +from flow.record.adapter import AbstractWriter, AbstractReader + + +class XlsxWriter(AbstractWriter): + fp = None + wb = None + + def __init__(self, path, **kwargs): + self.fp = record.open_path(path, "wb") + self.wb = openpyxl.Workbook() + self.ws = self.wb.active + self.desc = None + # self.ws.title = "Records" + + def write(self, r): + if not self.desc: + self.desc = r._desc + self.ws.append(r._desc.fields) + + self.ws.append(r._asdict().values()) + + def flush(self): + if self.wb: + self.wb.save(self.fp) + + def close(self): + if self.wb: + self.wb.close() + self.wb = None + + if self.fp and not is_stdout(self.fp): + self.fp.close() + self.fp = None + + +class XlsxReader(AbstractReader): + fp = None + + def __init__(self, path, selector=None, **kwargs): + self.selector = make_selector(selector) + self.fp = record.open_path(path, "rb") + self.desc = None + self.wb = openpyxl.load_workbook(self.fp) + self.ws = self.wb.active + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + + def __iter__(self): + desc = None + for row in self.ws.rows: + if not desc: + desc = record.RecordDescriptor([col.value.replace(" ", "_").lower() for col in row]) + continue + + obj = desc(*[col.value for col in row]) + if not self.selector or self.selector.match(obj): + yield obj diff --git a/flow/record/base.py b/flow/record/base.py new file mode 100644 index 0000000..f1730c2 --- /dev/null +++ b/flow/record/base.py @@ -0,0 +1,807 @@ +import importlib +import io +import re +import os +import sys +import gzip +import struct +import logging +import keyword +import hashlib +import functools +import collections +try: + # Python 2 + import urlparse +except ImportError: + # Python 3 + import urllib.parse as urlparse +try: + import lz4.frame as lz4 + HAS_LZ4 = True +except ImportError: + HAS_LZ4 = False +try: + import bz2 + HAS_BZ2 = True +except ImportError: + HAS_BZ2 = False +try: + import zstandard as zstd + HAS_ZSTD = True +except ImportError: + HAS_ZSTD = False + +from collections import OrderedDict +from operator import itemgetter as _itemgetter +from .whitelist import WHITELIST, WHITELIST_TREE +from .utils import to_str, to_native_str + +log = logging.getLogger(__package__) + +RECORD_VERSION = 1 +RESERVED_FIELDS = OrderedDict([ + ("_source", "string"), + ("_classification", "string"), + ("_generated", "datetime"), + # For compatibility reasons, always add new reserved fields BEFORE + # the _version field, but AFTER the second to last field + ("_version", "varint"), +]) + +# Compression 
Headers +GZIP_MAGIC = b"\x1f\x8b" +BZ2_MAGIC = b"BZh" +LZ4_MAGIC = b"\x04\x22\x4d\x18" +ZSTD_MAGIC = b"\x28\xb5\x2f\xfd" + +RE_VALID_FIELD_NAME = re.compile(r"^_?[a-zA-Z][a-zA-Z0-9_]*(?:\[\])?$") +RE_VALID_RECORD_TYPE_NAME = re.compile("^[a-zA-Z][a-zA-Z0-9_]*(/[a-zA-Z][a-zA-Z0-9_]*)*$") + +RECORD_CLASS_TEMPLATE = """ +from datetime import datetime +from itertools import zip_longest + +class {name}(Record): + _desc = desc + _field_types = {field_types} + + __slots__ = {slots_tuple} + + def __init__(__self, {args}): +{init_code} + + @classmethod + def _unpack(__cls, {args}): +{unpack_code} +""" + + +class Peekable: + """Wrapper class for adding .peek() to a file object.""" + + def __init__(self, fd): + self.fd = fd + self.buffer = None + + def peek(self, size): + if self.buffer is not None: + raise BufferError("Only 1 peek allowed") + data = self.fd.read(size) + self.buffer = io.BytesIO(data) + return data + + def read(self, size=None): + data = b"" + if self.buffer is None: + data = self.fd.read(size) + else: + data = self.buffer.read(size) + if len(data) < size: + data += self.fd.read(size - len(data)) + self.buffer = None + return data + + def close(self): + self.buffer = None + self.fd.close() + self.fd = None + + +class RecordDescriptorError(Exception): + pass + + +class FieldType: + + def _typename(self): + t = type(self) + t.__module__.split(".fieldtypes.")[1] + "." + t.__name__ + + @classmethod + def default(cls): + """Return the default value for the field in the Record template.""" + return None + + @classmethod + def _unpack(cls, data): + return data + + +class Record: + __slots__ = () + + def __eq__(self, other): + if not isinstance(other, Record): + return False + return self._pack() == other._pack() + + def _pack(self, unversioned=False): + values = [] + for k in self.__slots__: + v = getattr(self, k) + v = v._pack() if isinstance(v, FieldType) else v + + # Skip version field if requested (only for compatibility reasons) + if unversioned and k == "_version" and v == 1: + continue + else: + values.append(v) + + return self._desc.identifier, tuple(values) + + def _packdict(self): + return dict( + (k, v._pack() if isinstance(v, FieldType) else v) + for k, v in ((k, getattr(self, k)) for k in self.__slots__)) + + def _asdict(self, fields=None, exclude=None): + exclude = exclude or [] + if fields: + return OrderedDict((k, getattr(self, k)) for k in fields if k in self.__slots__ and k not in exclude) + return OrderedDict((k, getattr(self, k)) for k in self.__slots__ if k not in exclude) + + def __setattr__(self, k, v): + """Enforce setting the fields to their respective types.""" + # NOTE: This is a HOT code path + field_type = self._field_types.get(k) + if v is not None and k in self.__slots__ and field_type: + if not isinstance(v, field_type): + v = field_type(v) + super().__setattr__(k, v) + + def _replace(self, **kwds): + result = self.__class__(*map(kwds.pop, self.__slots__, (getattr(self, k) for k in self.__slots__))) + if kwds: + raise ValueError('Got unexpected field names: {kwds!r}'.format(kwds=list(kwds))) + return result + + def __repr__(self): + return "<{} {}>".format( + self._desc.name, + " ".join("{}={!r}".format(k, getattr(self, k)) for k in self._desc.fields)) + + +class GroupedRecord(Record): + """ + GroupedRecord acts like a normal Record, but can contain multiple records. + + See it as a flat Record view on top of multiple Records. + If two Records have the same fieldname, the first one will prevail. 
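+
+    Example (a sketch; the descriptors and field names are illustrative):
+
+        meta = MetaDescriptor(author="alice")
+        data = DataDescriptor(value=42)
+        grouped = GroupedRecord("combined/record", [meta, data])
+        grouped.author   # "alice", resolved via the first record that has the field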
+ """ + + def __init__(self, name, records): + super().__init__() + self.name = to_str(name) + self.records = [] + self.descriptors = [] + self.flat_fields = [] + + # to avoid recursion in __setattr__ and __getattr__ + self.__dict__["fieldname_to_record"] = OrderedDict() + + for rec in records: + if isinstance(rec, GroupedRecord): + for r in rec.records: + self.records.append(r) + self.descriptors.append(r._desc) + else: + self.records.append(rec) + self.descriptors.append(rec._desc) + + all_fields = rec._desc.get_all_fields() + required_fields = rec._desc.get_required_fields() + for field in all_fields.values(): + fname = field.name + if fname in self.fieldname_to_record: + continue + self.fieldname_to_record[fname] = rec + if fname not in required_fields: + self.flat_fields.append(field) + # flat descriptor to maintain compatibility with Record + + self._desc = RecordDescriptor(self.name, [(f.typename, f.name) for f in self.flat_fields]) + + def get_record_by_type(self, type_name): + """ + Get record in a GroupedRecord by type_name. + + Args: + type_name (str): The record type name (for example wq/meta). + + Returns: + None or the record + + """ + for record in self.records: + if record._desc.name == type_name: + return record + return None + + def _asdict(self, fields=None, exclude=None): + exclude = exclude or [] + keys = self.fieldname_to_record.keys() + if fields: + return OrderedDict((k, getattr(self, k)) for k in fields if k in keys and k not in exclude) + return OrderedDict((k, getattr(self, k)) for k in keys if k not in exclude) + + def __repr__(self): + return "<{} {}>".format(self.name, self.records) + + def __setattr__(self, attr, val): + if attr in getattr(self, "fieldname_to_record", {}): + x = self.fieldname_to_record.get(attr) + return setattr(x, attr, val) + return object.__setattr__(self, attr, val) + + def __getattr__(self, attr): + x = self.__dict__.get("fieldname_to_record", {}).get(attr) + if x: + return getattr(x, attr) + raise AttributeError(attr) + + def _pack(self): + return ( + self.name, + tuple(record._pack() for record in self.records), + ) + + def _replace(self, **kwds): + new_records = [] + for record in self.records: + new_records.append( + record.__class__(*map(kwds.pop, record.__slots__, (getattr(self, k) for k in record.__slots__))) + ) + if kwds: + raise ValueError('Got unexpected field names: {kwds!r}'.format(kwds=list(kwds))) + return GroupedRecord(self.name, new_records) + + +def is_valid_field_name(name, check_reserved=True): + if check_reserved: + if name in RESERVED_FIELDS: + return False + else: + if name in RESERVED_FIELDS: + return True + + if name.startswith("_"): + return False + + if not RE_VALID_FIELD_NAME.match(name): + return False + + return True + + +def parse_def(definition): + record_type = None + fields = [] + for line in definition.split("\n"): + line = line.strip() + + if not line: + continue + + if not record_type: + record_type = line + else: + _type, name = re.split(r"\s+", line.rstrip(";")) + + fields.append((_type, name)) + + return record_type, fields + + +class RecordField: + name = None + typename = None + type = None + + def __init__(self, name, typename): + if not is_valid_field_name(name, check_reserved=False): + raise RecordDescriptorError("Invalid field name: {}".format(name)) + + self.name = to_str(name) + self.typename = to_str(typename) + + self.type = fieldtype(typename) + + def __repr__(self): + return "".format(self.name, self.typename) + + +class RecordFieldSet(list): + pass + + +class RecordDescriptor: + name 
= None + fields = None + recordType = None + _desc_hash = None + + def __init__(self, name, fields=None): + name = to_str(name) + + if isinstance(fields, RecordDescriptor): + # Clone fields + fields = fields.get_field_tuples() + elif not fields: + name, fields = parse_def(name) + + fields = list([(to_native_str(k), to_str(v)) for k, v in fields]) + + contains_keyword = False + for fieldtype, fieldname in fields: + if not is_valid_field_name(fieldname): + raise RecordDescriptorError("Field '{}' is an invalid or reserved field name.".format(fieldname)) + + # Reserved Python keywords are allowed as field names, but at a cost. + # When a Python keyword is used as a field name, you can't use it as a kwarg anymore + # You'll be forced to either use *args or a expanding a dict to kwargs to initialize a record + # E.g. Record('from_value', 'and_value') or Record(**{'from': 1, 'and': 2}) + # You'll also only be able to get or set reserved attributes using getattr or setattr. + # Record initialization will also be slower, due to a different (slower) implementation + # that is compatible with this method of initializing records. + if keyword.iskeyword(fieldname): + contains_keyword = True + + self.fields = OrderedDict([(n, RecordField(n, _type)) for _type, n in fields]) + all_fields = self.get_all_fields() + self.name = name + + if not RE_VALID_RECORD_TYPE_NAME.match(name): + raise RecordDescriptorError("Invalid record type name") + + args = "" + init_code = "" + unpack_code = "" + + if len(all_fields) >= 255 and not (sys.version_info >= (3, 7)) or contains_keyword: + args = "*args, **kwargs" + init_code = ( + "\t\tfor k, v in zip_longest(__self.__slots__, args):\n" + + "\t\t\tsetattr(__self, k, kwargs.get(k, v))\n" + + "\t\t_generated = __self._generated\n") + unpack_code = ( + "\t\tvalues = dict([(f, __cls._field_types[f]._unpack(kwargs.get(f, v)) " + + "if kwargs.get(f, v) is not None else None) for f, v in zip_longest(__cls.__slots__, args)])\n" + + "\t\treturn __cls(**values)") + else: + args = ", ".join(["{}=None".format(k) for k in all_fields]) + unpack_code = "\t\treturn __cls(\n" + for field in all_fields.values(): + if field.type.default == FieldType.default: + default = FieldType.default() + else: + default = "_field_{field.name}.type.default()".format(field=field) + init_code += "\t\t__self.{field} = {field} if {field} is not None else {default}\n".format( + field=field.name, default=default) + unpack_code += ( + "\t\t\t{field} = _field_{field}.type._unpack({field}) " + + "if {field} is not None else {default},\n").format( + field=field.name, default=default) + unpack_code += "\t\t)" + + init_code += "\t\t__self._generated = _generated or datetime.utcnow()\n\t\t__self._version = RECORD_VERSION" + # Store the fieldtypes so we can enforce them in __setattr__() + field_types = "{\n" + for field in all_fields: + field_types += "\t\t{field!r}: _field_{field}.type,\n".format(field=field) + field_types += "\t}" + + code = RECORD_CLASS_TEMPLATE.format( + name=name.replace("/", "_"), + args=args, + slots_tuple=tuple(all_fields.keys()), + init_code=init_code, + unpack_code=unpack_code, + field_types=field_types, + ) + + code = code.replace("\t", " ") + c = compile(code, "", "exec") + + data = { + "desc": self, "Record": Record, "OrderedDict": OrderedDict, + "_itemgetter": _itemgetter, "_property": property, + "RECORD_VERSION": RECORD_VERSION, + } + for field in all_fields.values(): + data["_field_{}".format(field.name)] = field + + exec(c, data) + + self.recordType = data[name.replace("/", "_")] + 
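+        # The compiled class is exposed as `recordType`; calling the descriptor
+        # itself (see __call__) instantiates it.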
+ self.identifier = (self.name, self.descriptor_hash) + + @staticmethod + def get_required_fields(): + """ + Get required fields. + + Returns: + OrderedDict + + """ + required_fields = OrderedDict([(k, RecordField(k, v)) for k, v in RESERVED_FIELDS.items()]) + return required_fields + + def get_all_fields(self): + """ + Get all fields including required meta fields. + + Returns: + OrderedDict + + """ + required_fields = self.get_required_fields() + fields = self.fields.copy() + fields.update(required_fields) + return fields + + def getfields(self, typename): + if isinstance(typename, DynamicFieldtypeModule): + name = typename.gettypename() + else: + name = typename + + return RecordFieldSet(field for field in self.fields.values() if field.typename == name) + + def __call__(self, *args, **kwargs): + return self.recordType(*args, **kwargs) + + def init_from_dict(self, rdict, raise_unknown=False): + """Create a new Record initialized with key, value pairs from `rdict`. + + If `raise_unknown=True` then fields on `rdict` that are unknown to this + RecordDescriptor will raise a TypeError exception due to initializing + with unknown keyword arguments. (default: False) + + Returns: + Record + + """ + + if not raise_unknown: + rdict = {k: v for k, v in rdict.items() if k in self.recordType.__slots__} + return self.recordType(**rdict) + + def init_from_record(self, record, raise_unknown=False): + """Create a new Record initialized with data from another `record`. + + If `raise_unknown=True` then fields on `record` that are unknown to this + RecordDescriptor will raise a TypeError exception due to initializing + with unknown keyword arguments. (default: False) + + Returns: + Record + + """ + return self.init_from_dict(record._asdict(), raise_unknown=raise_unknown) + + def extend(self, fields): + """Returns a new RecordDescriptor with the extended fields + + Returns: + RecordDescriptor + """ + new_fields = list(self.get_field_tuples()) + fields + return RecordDescriptor(self.name, new_fields) + + def get_field_tuples(self): + """Returns a tuple containing the (typename, name) tuples, eg: + + (('boolean', 'foo'), ('string', 'bar')) + + Returns: + tuple + """ + return tuple((self.fields[f].typename, self.fields[f].name) for f in self.fields) + + @staticmethod + @functools.lru_cache(maxsize=256) + def calc_descriptor_hash(name, fields): + """Calculate and return the (cached) descriptor hash as a 32 bit integer. + + The descriptor hash is the first 4 bytes of the sha256sum of the descriptor name and field names and types. + """ + h = hashlib.sha256(name.encode("utf-8")) + for (typename, name) in fields: + h.update(name.encode("utf-8")) + h.update(typename.encode("utf-8")) + return struct.unpack(">L", h.digest()[:4])[0] + + @property + def descriptor_hash(self): + """Returns the (cached) descriptor hash""" + if not self._desc_hash: + self._desc_hash = self.calc_descriptor_hash(self.name, self.get_field_tuples()) + return self._desc_hash + + def __hash__(self): + return hash((self.name, self.get_field_tuples())) + + def __eq__(self, other): + if isinstance(other, RecordDescriptor): + return self.name == other.name and self.get_field_tuples() == other.get_field_tuples() + return NotImplemented + + def __repr__(self): + return "".format(self.name, self.descriptor_hash) + + def definition(self, reserved=True): + """Return the RecordDescriptor as Python definition string. + + If `reserved` is True it will also return the reserved fields. 
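+
+        The output is roughly of the form (names here are illustrative):
+
+            RecordDescriptor("my/record", [
+                ("string", "field_one"),
+                ("string", "_source"),
+                ...
+            ])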
+ """ + fields = [] + for ftype in self.get_all_fields().values(): + if not reserved and ftype.name.startswith("_"): + continue + fields.append( + ' ("{ftype.typename}", "{ftype.name}"),'.format(ftype=ftype)) + fields_str = "\n".join(fields) + return 'RecordDescriptor("{}", [\n{}\n])'.format(self.name, fields_str) + + def base(self, **kwargs_sink): + def wrapper(**kwargs): + kwargs.update(kwargs_sink) + return self.recordType(**kwargs) + + return wrapper + + def _pack(self): + return self.name, [(field.typename, field.name) for field in self.fields.values()] + + @staticmethod + def _unpack(name, fields): + return RecordDescriptor(name, fields) + + +def DynamicDescriptor(name, fields): + return RecordDescriptor(name, [("dynamic", field) for field in fields]) + + +def open_path(path, mode, clobber=True): + """ + Open `path` using `mode` and returns a file object. + + It handles special cases if path is meant to be stdin or stdout. + And also supports compression based on extension or file header of stream. + + Args: + path (str): Filename or path to filename to open + mode (str): Could be "r", "rb" to open file for reading, "w", "wb" for writing + clobber (bool): Overwrite file if it already exists if `clobber=True`, else raises IOError. + + """ + binary = "b" in mode + fp = None + if mode in ("w", "wb"): + out = True + elif mode in ("r", "rb"): + out = False + else: + raise ValueError("mode string can only be 'r', 'rb', 'w', or 'wb', not {!r}".format(mode)) + + # check for stdin or stdout + is_stdio = path in (None, "", "-") + + # check if output path exists + if not is_stdio and not clobber and os.path.exists(path) and out: + raise IOError("Output file {!r} already exists, and clobber=False".format(path)) + + # check path extension for compression + if path: + if path.endswith(".gz"): + fp = gzip.GzipFile(path, mode) + elif path.endswith(".bz2"): + if not HAS_BZ2: + raise RuntimeError('bz2 python module not available') + fp = bz2.BZ2File(path, mode) + elif path.endswith(".lz4"): + if not HAS_LZ4: + raise RuntimeError('lz4 python module not available') + fp = lz4.open(path, mode) + elif path.endswith((".zstd", ".zst")): + if not HAS_ZSTD: + raise RuntimeError('zstandard python module not available') + if not out: + dctx = zstd.ZstdDecompressor() + fp = dctx.stream_reader(open(path, "rb")) + else: + cctx = zstd.ZstdCompressor() + fp = cctx.stream_writer(open(path, "wb")) + + # normal file or stdio for reading or writing + if not fp: + if is_stdio: + if binary: + fp = getattr(sys.stdout, "buffer", sys.stdout) if out else getattr(sys.stdin, "buffer", sys.stdin) + else: + fp = sys.stdout if out else sys.stdin + else: + fp = io.open(path, mode) + # check if we are reading a compressed stream + if not out and binary: + if not hasattr(fp, "peek"): + fp = Peekable(fp) + peek_data = fp.peek(4) + if peek_data[:2] == GZIP_MAGIC: + fp = gzip.GzipFile(fileobj=fp, mode=mode) + elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC: + fp = bz2.BZ2File(fp, mode=mode) + elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC: + fp = lz4.open(fp, mode=mode) + elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC: + dctx = zstd.ZstdDecompressor() + fp = dctx.stream_reader(fp) + return fp + + +def RecordAdapter(url, out, selector=None, clobber=True): + url = url or "" + url = str(url) + + # Guess adapter based on extension + ext_to_adapter = { + ".avro": "avro", + ".json": "jsonfile", + } + _, ext = os.path.splitext(url) + + p = urlparse.urlparse(url, ext_to_adapter.get(ext, "stream")) + + if '+' in p.scheme: + adapter, sub_adapter = 
p.scheme.split("+", 1) + else: + adapter = p.scheme + sub_adapter = None + + mod = importlib.import_module("flow.record.adapter.{}".format(adapter)) + + clsname = ("{}Writer" if out else "{}Reader").format(adapter.title()) + + cls = getattr(mod, clsname) + arg_dict = dict(urlparse.parse_qsl(p.query)) + cls_url = p.netloc + p.path + if sub_adapter: + cls_url = sub_adapter + "://" + cls_url + + if not out and selector: + arg_dict["selector"] = selector + + if out: + arg_dict["clobber"] = clobber + + log.debug("Creating {!r} for {!r} with args {!r}".format(cls, url, arg_dict)) + return cls(cls_url, **arg_dict) + + +def RecordReader(url=None, selector=None): + return RecordAdapter(url, False, selector=selector) + + +def RecordWriter(url=None, clobber=True): + return RecordAdapter(url, True, clobber=clobber) + + +def stream(src, dst): + for r in src: + dst.write(r) + dst.flush() + + +def fieldtype(clspath): + if clspath.endswith('[]'): + origpath = clspath + clspath = clspath[:-2] + islist = True + else: + islist = False + + if clspath not in WHITELIST: + raise AttributeError("Invalid field type: {}".format(clspath)) + + p = clspath.rsplit(".", 1) + module_path = "flow.record.fieldtypes" + clsname = p.pop() + if p: + module_path += "." + p[0] + + mod = importlib.import_module(module_path) + + t = getattr(mod, clsname) + + if not issubclass(t, FieldType): + raise AttributeError("Field type does not derive from FieldType") + + if islist: + listtype = type(origpath, mod.typedlist.__bases__, dict(mod.typedlist.__dict__)) + listtype.__type__ = t + t = listtype + + return t + + +def extend_record(record, other_records, replace=False, name=None): + """Extend `record` with fields and values from `other_records`. + + Duplicate fields are ignored in `other_records` unless `replace=True`. + + Args: + record (Record): Initial Record we want to extend. + other_records (List[Record]): List of Records we use for extending/replacing. + replace (bool): if `True`, it will replace existing fields and values + in `record` from fields and values from `other_records`. Last record always wins. + name (str): rename the RecordDescriptor name to `name`. Otherwise, use name from + initial `record`. + """ + field_map = collections.OrderedDict( + (fname, ftype) for (ftype, fname) in record._desc.get_field_tuples() + ) + record_maps = [record._asdict()] + for other in other_records: + for (ftype, fname) in other._desc.get_field_tuples(): + if not replace and fname in field_map: + continue + field_map[fname] = ftype + record_maps.append(other._asdict()) + field_tuples = [(ftype, fname) for (fname, ftype) in field_map.items()] + ExtendedRecord = RecordDescriptor(name or record._desc.name, field_tuples) + if replace: + record_maps = record_maps[::-1] + return ExtendedRecord.init_from_dict(collections.ChainMap(*record_maps)) + + +class DynamicFieldtypeModule: + + def __init__(self, path=""): + self.path = path + + def __getattr__(self, path): + path = (self.path + "." 
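+            # build the dotted type path incrementally, e.g. "net" -> "net.ipv4" -> "net.ipv4.address"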
if self.path else "") + path + + obj = WHITELIST_TREE + for p in path.split('.'): + if p not in obj: + raise AttributeError("Invalid field type: {}".format(path)) + obj = obj[p] + + return DynamicFieldtypeModule(path) + + def gettypename(self): + if fieldtype(self.path): + return self.path + + def __call__(self, *args, **kwargs): + t = fieldtype(self.path) + + return t(*args, **kwargs) + + +net = DynamicFieldtypeModule("net") +dynamic_fieldtype = DynamicFieldtypeModule() diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py new file mode 100644 index 0000000..0a8fdcc --- /dev/null +++ b/flow/record/fieldtypes/__init__.py @@ -0,0 +1,491 @@ +import re +import math +import warnings + +import binascii +from binascii import a2b_hex, b2a_hex +from posixpath import basename, dirname + +from datetime import datetime as _dt, timedelta +from flow.record.base import FieldType + +try: + import urlparse +except ImportError: + import urllib.parse as urlparse + +RE_NORMALIZE_PATH = re.compile(r'[\\/]+') +NATIVE_UNICODE = isinstance(u'', str) + +string_type = str +varint_type = int +bytes_type = bytes +float_type = float + + +def fieldtype_for_value(value, default="string"): + """Returns fieldtype name derived from the value. Returns `default` if it cannot be derived. + + Args: + value: value to derive the fieldtype from + + Returns: + str: the field type name or `default` if it cannot be derived + + Examples: + >>> fieldtype_for_value("hello") + "string" + >>> fieldtype_for_value(1337) + "varint" + >>> fieldtype_for_value(object(), None) + None + """ + if isinstance(value, bytes_type): + return "bytes" + elif isinstance(value, string_type): + return "string" + elif isinstance(value, float_type): + return "float" + elif isinstance(value, bool): + return "boolean" + elif isinstance(value, (varint_type, int)): + return "varint" + elif isinstance(value, _dt): + return "datetime" + return default + + +class dynamic(FieldType): + + def __new__(cls, obj): + if isinstance(obj, FieldType): + # Already a flow field type + return obj + + elif isinstance(obj, bytes_type): + return bytes(obj) + + elif isinstance(obj, string_type): + return string(obj) + + elif isinstance(obj, bool): + # Must appear before int, because bool is a subclass of int + return boolean(obj) + + elif isinstance(obj, (varint_type, int)): + return varint(obj) + + elif isinstance(obj, _dt): + return datetime(obj) + + elif isinstance(obj, (list, tuple)): + return stringlist(obj) + + raise NotImplementedError("Unsupported type for dynamic fieldtype: {}".format(type(obj))) + + +class typedlist(list, FieldType): + + __type__ = None + + def __init__(self, values=None): + if not values: + values = [] + super(self.__class__, self).__init__(self._convert(values)) + + def _convert(self, values): + return [self.__type__(f) if not isinstance(f, self.__type__) else f for f in values] + + def _pack(self): + result = [] + for f in self: + if not isinstance(f, self.__type__): + # Dont pack records already, it's the job of RecordPacker to pack record fields. + # Otherwise unpacking will yield unexpected results (records that are not unpacked). 
+ if self.__type__ == record: + r = f + else: + r = self.__type__(f)._pack() + result.append(r) + else: + r = f._pack() + result.append(r) + return result + + @classmethod + def _unpack(cls, data): + data = map(cls.__type__._unpack, data) + return cls(data) + + @classmethod + def default(cls): + """Override default so the field is always an empty list.""" + return cls() + + +class dictlist(list, FieldType): + + def _pack(self): + return self + + +class stringlist(list, FieldType): + + def _pack(self): + return self + + +class string(string_type, FieldType): + + def __new__(cls, value): + if isinstance(value, bytes_type): + value = cls._decode(value, "utf-8") + if isinstance(value, bytes_type): + # Still bytes, so decoding failed (Python 2) + return bytes(value) + return super().__new__(cls, value) + + def _pack(self): + return self + + @classmethod + def _decode(cls, data, encoding): + """Decode a byte-string into a unicode-string. + + Python 3: When `data` contains invalid unicode characters a `UnicodeDecodeError` is raised. + Python 2: When `data` contains invalid unicode characters the original byte-string is returned. + """ + if NATIVE_UNICODE: + # Raises exception on decode error + return data.decode(encoding) + try: + return data.decode(encoding) + except UnicodeDecodeError: + # Fallback to bytes (Python 2 only) + preview = data[:16].encode('hex_codec') + ('..' if len(data) > 16 else '') + warnings.warn("Got binary data in string field (hex: {}). Compatibility is not guaranteed.".format( + preview), RuntimeWarning) + return data + + +# Alias for backwards compatibility +wstring = string + + +class bytes(bytes_type, FieldType): + value = None + + def __init__(self, value): + if not isinstance(value, bytes_type): + raise TypeError("Value not of bytes type") + self.value = value + + def _pack(self): + return self.value + + def __repr__(self): + return repr(self.value) + + +class datetime(_dt, FieldType): + + def __new__(cls, *args, **kwargs): + if len(args) == 1 and not kwargs: + arg = args[0] + if isinstance(arg, bytes_type): + arg = arg.decode("utf-8") + if isinstance(arg, string_type): + # I expect ISO 8601 format e.g. datetime.isformat() + # When the microseconds part is 0, str(datetime) will not print the microsecond part (only seconds) + # So we have to account for this. + # String constructor is used for example in JsonRecordAdapter + if "." 
in arg: + return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S.%f") + else: + return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S") + elif isinstance(arg, (int,)): + return cls.utcfromtimestamp(arg) + elif isinstance(arg, (_dt,)): + return _dt.__new__( + cls, + arg.year, arg.month, arg.day, + arg.hour, arg.minute, arg.second, arg.microsecond, + arg.tzinfo) + + return _dt.__new__(cls, *args, **kwargs) + + def __eq__(self, other): + return self - other == timedelta(0) + + def _pack(self): + return self + + def __repr__(self): + result = str(self) + return result + + +class varint(varint_type, FieldType): + + def _pack(self): + return self + + +class float(float, FieldType): + + def _pack(self): + return self + + +class uint16(int, FieldType): + + value = None + + def __init__(self, value): + if value < 0 or value > 0xffff: + raise ValueError("Value not within (0x0, 0xffff), got: {}".format(value)) + + self.value = value + + def _pack(self): + return self.value + + def __repr__(self): + return str(self.value) + + +class uint32(int, FieldType): + value = None + + def __init__(self, value): + if value < 0 or value > 0xffffffff: + raise ValueError("Value not within (0x0, 0xffffffff), got {}".format(value)) + + self.value = value + + def _pack(self): + return self.value + + +class boolean(int, FieldType): + value = None + + def __init__(self, value): + if value < 0 or value > 1: + raise ValueError("Value not a valid boolean value") + + self.value = bool(value) + + def _pack(self): + return self.value + + def __str__(self): + return str(self.value) + + def __repr__(self): + return str(self.value) + + +def human_readable_size(x): + # hybrid of http://stackoverflow.com/a/10171475/2595465 + # with http://stackoverflow.com/a/5414105/2595465 + if x == 0: + return '0' + magnitude = int(math.log(abs(x), 10.24)) + if magnitude > 16: + format_str = '%iP' + # denominator_mag = 15 + else: + float_fmt = '%2.1f' if magnitude % 3 == 1 else '%1.2f' + illion = (magnitude + 1) // 3 + format_str = float_fmt + " " + [' ', 'K', 'M', 'G', 'T', 'P'][illion] + return (format_str % (x * 1.0 / (1024 ** illion))) + "B" + + +class filesize(varint): + + def __repr__(self): + return human_readable_size(self) + + +class unix_file_mode(varint): + + def __repr__(self): + return oct(self).rstrip("L") + + +class digest(FieldType): + __md5 = __md5_bin = None + __sha1 = __sha1_bin = None + __sha256 = __sha256_bin = None + + def __init__(self, value=None, **kwargs): + if isinstance(value, (tuple, list)): + self.md5, self.sha1, self.sha256 = value + elif isinstance(value, dict): + self.md5 = value.get("md5", self.md5) + self.sha1 = value.get("sha1", self.sha1) + self.sha256 = value.get("sha256", self.sha256) + + @classmethod + def default(cls): + """Override default so the field is always a digest() instance.""" + return cls() + + def __repr__(self): + return "(md5={d.md5}, sha1={d.sha1}, sha256={d.sha256})".format(d=self) + + @property + def md5(self): + return self.__md5 + + @property + def sha1(self): + return self.__sha1 + + @property + def sha256(self): + return self.__sha256 + + @md5.setter + def md5(self, val): + if val is None: + self.__md5 = self.__md5_bin = None + return + try: + self.__md5_bin = a2b_hex(val) + self.__md5 = val + if len(self.__md5_bin) != 16: + raise TypeError("Incorrect hash length") + except binascii.Error as e: + raise TypeError("Invalid MD5 value {!r}, {}".format(val, e)) + + @sha1.setter + def sha1(self, val): + if val is None: + self.__sha1 = self.__sha1_bin = None + return + try: + self.__sha1_bin = a2b_hex(val) + 
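+            # keep both the hex string and its binary form; the length check below
+            # rejects values that are not 20 bytes (40 hex characters)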
self.__sha1 = val + if len(self.__sha1_bin) != 20: + raise TypeError("Incorrect hash length") + except binascii.Error as e: + raise TypeError("Invalid SHA-1 value {!r}, {}".format(val, e)) + + @sha256.setter + def sha256(self, val): + if val is None: + self.__sha256 = self.__sha256_bin = None + return + try: + self.__sha256_bin = a2b_hex(val) + self.__sha256 = val + if len(self.__sha256_bin) != 32: + raise TypeError("Incorrect hash length") + except binascii.Error as e: + raise TypeError("Invalid SHA-256 value {!r}, {}".format(val, e)) + + def _pack(self): + return ( + self.__md5_bin, + self.__sha1_bin, + self.__sha256_bin, + ) + + @classmethod + def _unpack(cls, data): + value = ( + b2a_hex(data[0]).decode() if data[0] else None, + b2a_hex(data[1]).decode() if data[1] else None, + b2a_hex(data[2]).decode() if data[2] else None, + ) + return cls(value) + + +class uri(string, FieldType): + + def __init__(self, value): + self._parsed = urlparse.urlparse(value) + + @staticmethod + def normalize(path): + r"""Normalize Windows paths to posix. + + c:\windows\system32\cmd.exe -> c:/windows/system32/cmd.exe + """ + return RE_NORMALIZE_PATH.sub('/', path) + + @classmethod + def from_windows(cls, path): + """Initialize a uri instance from a windows path.""" + return cls(uri.normalize(path)) + + @property + def scheme(self): + return self._parsed.scheme + + @property + def protocol(self): + return self.scheme + + @property + def netloc(self): + return self._parsed.netloc + + @property + def path(self): + return self._parsed.path + + @property + def params(self): + return self._parsed.params + + @property + def query(self): + return self._parsed.query + + @property + def args(self): + return self.query + + @property + def fragment(self): + return self._parsed.fragment + + @property + def username(self): + return self._parsed.username + + @property + def password(self): + return self._parsed.password + + @property + def hostname(self): + return self._parsed.hostname + + @property + def port(self): + return self._parsed.port + + @property + def filename(self): + return basename(self.path) + + @property + def dirname(self): + return dirname(self.path) + + +class record(FieldType): + + def __new__(cls, record_value): + return record_value + + def _pack(self): + return self.value + + @classmethod + def _unpack(cls, data): + return data diff --git a/flow/record/fieldtypes/credential.py b/flow/record/fieldtypes/credential.py new file mode 100644 index 0000000..cc87675 --- /dev/null +++ b/flow/record/fieldtypes/credential.py @@ -0,0 +1,9 @@ +from flow.record.fieldtypes import string + + +class username(string): + pass + + +class password(string): + pass diff --git a/flow/record/fieldtypes/net/__init__.py b/flow/record/fieldtypes/net/__init__.py new file mode 100644 index 0000000..10e83e3 --- /dev/null +++ b/flow/record/fieldtypes/net/__init__.py @@ -0,0 +1,15 @@ +from flow.record.fieldtypes import string +from .ip import ipaddress, ipnetwork, IPAddress, IPNetwork + +__all__ = [ + 'ipaddress', 'ipnetwork', + 'IPAddress', 'IPNetwork', +] + + +class hostname(string): + pass + + +class email(string): + pass diff --git a/flow/record/fieldtypes/net/ip.py b/flow/record/fieldtypes/net/ip.py new file mode 100644 index 0000000..b11c680 --- /dev/null +++ b/flow/record/fieldtypes/net/ip.py @@ -0,0 +1,80 @@ +from ipaddress import ip_address, ip_network +from flow.record.base import FieldType + + +class ipaddress(FieldType): + val = None + _type = "net.ipaddress" + + def __init__(self, addr): + self.val = ip_address(addr) 
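+        # ip_address() accepts IPv4 and IPv6 values as strings, integers or packed bytes;
+        # comparison against plain strings works via __eq__ below, e.g.
+        # ipaddress("10.13.37.1") == "10.13.37.1" is True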
+ + def __eq__(self, b): + try: + return self.val == ip_address(b) + except ValueError: + return False + + def __str__(self): + return str(self.val) + + def __repr__(self): + return "{}({!r})".format(self._type, str(self)) + + def _pack(self): + return self.val.packed + + @staticmethod + def _unpack(data): + return ipaddress(data) + + +class ipnetwork(FieldType): + val = None + _type = "net.ipnetwork" + + def __init__(self, addr): + self.val = ip_network(addr) + + def __eq__(self, b): + try: + return self.val == ip_network(b) + except ValueError: + return False + + @staticmethod + def _is_subnet_of(a, b): + try: + # Always false if one is v4 and the other is v6. + if a._version != b._version: + raise TypeError("{} and {} are not of the same version".format(a, b)) + return (b.network_address <= a.network_address and + b.broadcast_address >= a.broadcast_address) + except AttributeError: + raise TypeError("Unable to test subnet containment " + "between {} and {}".format(a, b)) + + def __contains__(self, b): + try: + return self._is_subnet_of(ip_network(b), self.val) + except (ValueError, TypeError): + return False + + def __str__(self): + return str(self.val) + + def __repr__(self): + return "{}({!r})".format(self._type, str(self)) + + def _pack(self): + return self.val.compressed + + @staticmethod + def _unpack(data): + return ipnetwork(data) + + +# alias: net.IPAddress -> net.ipaddress +# alias: net.IPNetwork -> net.ipnetwork +IPAddress = ipaddress +IPNetwork = ipnetwork diff --git a/flow/record/fieldtypes/net/ipv4.py b/flow/record/fieldtypes/net/ipv4.py new file mode 100644 index 0000000..e271b74 --- /dev/null +++ b/flow/record/fieldtypes/net/ipv4.py @@ -0,0 +1,137 @@ +import struct +import socket + +from flow.record import FieldType +from flow.record.utils import to_native_str + + +def addr_long(s): + if isinstance(s, Address): + return s.val + + if isinstance(s, int): + return s + + return struct.unpack(">I", socket.inet_aton(s))[0] + + +def addr_str(s): + if isinstance(s, Address): + return socket.inet_ntoa(struct.pack(">I", s.val)) + + if isinstance(s, int): + return socket.inet_ntoa(struct.pack(">I", s)) + + return s + + +def mask_to_bits(n): + return bin(n).count("1") + + +def bits_to_mask(b): + return (0xffffffff << (32 - b)) & 0xffffffff + + +class subnet(FieldType): + net = None + mask = None + _type = "net.ipv4.subnet" + + def __init__(self, addr, netmask=None): + if isinstance(addr, type(u'')): + addr = to_native_str(addr) + + if not isinstance(addr, str): + raise TypeError("Subnet() argument 1 must be string, not {}".format(type(addr).__name__)) + + if netmask is None: + ip, sep, mask = addr.partition("/") + self.mask = bits_to_mask(int(mask)) if mask else 0xffffffff + self.net = addr_long(ip) + else: + self.net = addr_long(addr) + self.mask = bits_to_mask(netmask) + + if self.net & self.mask != self.net: + suggest = '{}/{}'.format(addr_str(self.net & self.mask), mask_to_bits(self.mask)) + raise ValueError("Not a valid subnet {!r}, did you mean {!r} ?".format(str(addr), suggest)) + + def __contains__(self, addr): + if addr is None: + return False + + if isinstance(addr, type(u'')): + addr = to_native_str(addr) + + if isinstance(addr, str): + addr = addr_long(addr) + + if isinstance(addr, Address): + addr = addr.val + + if isinstance(addr, int): + return addr & self.mask == self.net + + return False + + def __str__(self): + return "{0}/{1}".format(addr_str(self.net), mask_to_bits(self.mask)) + + def __repr__(self): + return "{}({!r})".format(self._type, str(self)) + + +class 
SubnetList: + subnets = None + + def __init__(self): + self.subnets = [] + + def load(self, path): + f = open(path, "rb") + for line in f: + entry, desc = line.split(" ", 1) + self.subnets.append(Subnet(entry)) + + f.close() + + def add(self, subnet): + self.subnets.append(Subnet(subnet)) + + def __contains__(self, addr): + if type(addr) is str: + addr = addr_long(addr) + + return any(addr in s for s in self.subnets) + + +class address(FieldType): + val = None + _type = "net.ipv4.address" + + def __init__(self, addr): + self.val = addr_long(addr) + + def __eq__(self, b): + return addr_long(self) == addr_long(b) + + def __str__(self): + return addr_str(self.val) + + def __repr__(self): + return "{}({!r})".format(self._type, str(self)) + + def _pack(self): + return self.val + + @staticmethod + def _unpack(data): + return address(data) + + +# Backwards compatiblity +Address = address +Subnet = subnet + +__all__ = ["address", "subnet", "Address", "Subnet", "SubnetList"] diff --git a/flow/record/fieldtypes/net/tcp.py b/flow/record/fieldtypes/net/tcp.py new file mode 100644 index 0000000..aa4f4d9 --- /dev/null +++ b/flow/record/fieldtypes/net/tcp.py @@ -0,0 +1,9 @@ +from flow.record.fieldtypes import uint16 + + +class port(uint16): + pass + + +# Backwards compatiblity +Port = port diff --git a/flow/record/fieldtypes/net/udp.py b/flow/record/fieldtypes/net/udp.py new file mode 100644 index 0000000..aa4f4d9 --- /dev/null +++ b/flow/record/fieldtypes/net/udp.py @@ -0,0 +1,9 @@ +from flow.record.fieldtypes import uint16 + + +class port(uint16): + pass + + +# Backwards compatiblity +Port = port diff --git a/flow/record/jsonpacker.py b/flow/record/jsonpacker.py new file mode 100644 index 0000000..ca4ae35 --- /dev/null +++ b/flow/record/jsonpacker.py @@ -0,0 +1,101 @@ +import json +import base64 +import logging +from datetime import datetime + +from . 
import fieldtypes +from .base import Record, RecordDescriptor +from .utils import EventHandler + +log = logging.getLogger(__package__) + + +class JsonRecordPacker: + + def __init__(self, indent=None): + self.descriptors = {} + self.on_descriptor = EventHandler() + self.indent = indent + + def register(self, desc, notify=False): + if not isinstance(desc, RecordDescriptor): + raise Exception("Expected Record Descriptor") + + # Descriptor already known + if desc.identifier in self.descriptors: + return + + # versioned record descriptor + self.descriptors[desc.identifier] = desc + + # for older non versioned records + self.descriptors[desc.name] = desc + + if notify and self.on_descriptor: + log.debug("JsonRecordPacker::on_descriptor {}".format(desc)) + self.on_descriptor(desc) + + def pack_obj(self, obj): + if isinstance(obj, Record): + if obj._desc.identifier not in self.descriptors: + self.register(obj._desc, True) + serial = obj._asdict() + serial['_type'] = 'record' + serial['_recorddescriptor'] = obj._desc.identifier + + # PYTHON2: Because "bytes" are also "str" we have to handle this here + for (field_type, field_name) in obj._desc.get_field_tuples(): + if field_type == "bytes" and isinstance(serial[field_name], str): + serial[field_name] = base64.b64encode(serial[field_name]).decode() + + return serial + if isinstance(obj, RecordDescriptor): + serial = { + '_type': 'recorddescriptor', + '_data': obj._pack(), + } + return serial + if isinstance(obj, datetime): + serial = obj.strftime("%Y-%m-%dT%H:%M:%S.%f") + return serial + if isinstance(obj, fieldtypes.digest): + return { + "md5": obj.md5, + "sha1": obj.sha1, + "sha256": obj.sha256, + } + if isinstance(obj, (fieldtypes.net.ipaddress, fieldtypes.net.ipnetwork)): + return str(obj) + if isinstance(obj, bytes): + return base64.b64encode(obj).decode() + + raise Exception("Unpackable type " + str(type(obj))) + + def unpack_obj(self, obj): + if isinstance(obj, dict): + _type = obj.get('_type', None) + if _type == "record": + record_descriptor_identifier = obj['_recorddescriptor'] + record_descriptor_identifier = tuple(record_descriptor_identifier) + record_descriptor = self.descriptors[record_descriptor_identifier] + del obj['_recorddescriptor'] + del obj['_type'] + for (field_type, field_name) in record_descriptor.get_field_tuples(): + if field_type == "bytes": + obj[field_name] = base64.b64decode(obj[field_name]) + result = record_descriptor.recordType(**obj) + return result + if _type == "recorddescriptor": + data = obj['_data'] + return RecordDescriptor._unpack(*data) + return obj + + def pack(self, obj): + return json.dumps(obj, default=self.pack_obj, indent=self.indent) + + def unpack(self, d): + record_dict = json.loads(d, object_hook=self.unpack_obj) + result = self.unpack_obj(record_dict) + if isinstance(result, RecordDescriptor): + self.register(result) + return result diff --git a/flow/record/packer.py b/flow/record/packer.py new file mode 100644 index 0000000..efcbf9b --- /dev/null +++ b/flow/record/packer.py @@ -0,0 +1,167 @@ +import warnings +import binascii +import datetime +import msgpack +import functools + +from . 
import fieldtypes +from .base import Record, FieldType, RecordDescriptor, GroupedRecord, RESERVED_FIELDS, RECORD_VERSION +from .utils import EventHandler, to_str + +# Override defaults for msgpack packb/unpackb +packb = functools.partial(msgpack.packb, use_bin_type=True) +unpackb = functools.partial(msgpack.unpackb, raw=False) + +RECORD_PACK_EXT_TYPE = 0xe + +RECORD_PACK_TYPE_RECORD = 0x1 +RECORD_PACK_TYPE_DESCRIPTOR = 0x2 +RECORD_PACK_TYPE_FIELDTYPE = 0x3 +RECORD_PACK_TYPE_DATETIME = 0x10 +RECORD_PACK_TYPE_VARINT = 0x11 +RECORD_PACK_TYPE_GROUPEDRECORD = 0x12 + + +def identifier_to_str(identifier): + if isinstance(identifier, tuple) and len(identifier) == 2: + return (to_str(identifier[0]), identifier[1]) + else: + return to_str(identifier) + + +class RecordPacker: + EXT_TYPE = RECORD_PACK_EXT_TYPE + TYPES = [FieldType, Record, RecordDescriptor] + + def __init__(self): + self.descriptors = {} + self.on_descriptor = EventHandler() + + def register(self, desc, notify=False): + if not isinstance(desc, RecordDescriptor): + raise Exception("Expected Record Descriptor") + + # versioned record descriptor + self.descriptors[desc.identifier] = desc + + # for older non versioned records + self.descriptors[desc.name] = desc + + if notify and self.on_descriptor: + self.on_descriptor(desc) + + def pack_obj(self, obj, unversioned=False): + packed = None + + if isinstance(obj, datetime.datetime): + t = obj.utctimetuple()[:6] + (obj.microsecond, ) + packed = (RECORD_PACK_TYPE_DATETIME, t) + + elif isinstance(obj, int): + neg = obj < 0 + h = hex(abs(obj))[2:].rstrip("L") + if len(h) % 2 != 0: + h = "0" + h + + packed = RECORD_PACK_TYPE_VARINT, (neg, binascii.a2b_hex(h)) + + elif isinstance(obj, GroupedRecord): + for desc in obj.descriptors: + if desc.identifier not in self.descriptors: + self.register(desc, True) + + packed = RECORD_PACK_TYPE_GROUPEDRECORD, obj._pack() + + elif isinstance(obj, Record): + if obj._desc.identifier not in self.descriptors: + self.register(obj._desc, True) + + data = obj._pack(unversioned=unversioned) + packed = RECORD_PACK_TYPE_RECORD, data + + elif isinstance(obj, RecordDescriptor): + packed = RECORD_PACK_TYPE_DESCRIPTOR, obj._pack() + + if not packed: + raise Exception("Unpackable type " + str(type(obj))) + + return msgpack.ExtType(RECORD_PACK_EXT_TYPE, self.pack(packed)) + + def pack(self, obj): + return packb(obj, default=self.pack_obj) + + def unpack_obj(self, t, data): + if t != RECORD_PACK_EXT_TYPE: + raise Exception("Unknown ExtType") + + subtype, value = self.unpack(data) + + if subtype == RECORD_PACK_TYPE_DATETIME: + dt = fieldtypes.datetime(*value) + return dt + + if subtype == RECORD_PACK_TYPE_VARINT: + neg, h = value + v = int(binascii.b2a_hex(h), 16) + if neg: + v = -v + + return v + + if subtype == RECORD_PACK_TYPE_RECORD: + identifier, values = value + identifier = identifier_to_str(identifier) + desc = self.descriptors[identifier] + + # Compatibility for older records + # We check the actual amount of values against the expected amount of values + # The values received include reserved fields, so we have to add them to the + # fields already declared in the descriptor. + # The descriptor should be received from the same stream, so any inconsistency + # in field count should be from reserved fields. + version = values[-1] + expected_len = len(desc.fields) + len(RESERVED_FIELDS) + + # Perform some basic checking on record version, if any, and issue a warning if needed. 
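+            # Illustrative sketch, not part of the original code: in the common case the
+            # version matches and the field count lines up, so no warning is issued.
+            # Assuming RECORD_VERSION == 1:
+            #
+            #   packer = RecordPacker()
+            #   TestRecord = RecordDescriptor("test/compat", [("string", "value")])
+            #   blob = packer.pack(TestRecord(value="hello"))
+            #   record = packer.unpack(blob)  # round-trips without hitting the fallbacks below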
+ if not isinstance(version, int) or version < 1 or version > 255: + warnings.warn( + ("Got old style record with no version information (expected {:d}). " + + "Compatibility is not guaranteed.").format( + RECORD_VERSION), RuntimeWarning) + elif version != RECORD_VERSION: + warnings.warn( + "Got other version record (expected {:d}, got {:d}). Compatibility is not guaranteed.".format( + RECORD_VERSION, version), RuntimeWarning) + # Optionally add compatibility code here later + + # If the actual amount of fields is less, there's nothing we can really do. + # If the actual amount of fields is more, we strip additional fields but + # maintain the version field + # This implies that any record that has _more_ reserved fields always + # has a version field. + if len(values) > expected_len: + # Likely newer style record. Strip extra fields but maintain version field + values = values[:expected_len - 1] + values += (version,) + + return desc.recordType._unpack(*values) + + if subtype == RECORD_PACK_TYPE_GROUPEDRECORD: + name, packed_records = value + records = [] + for value in packed_records: + identifier, values = value + identifier = identifier_to_str(identifier) + desc = self.descriptors[identifier] + records.append(desc.recordType._unpack(*values)) + return GroupedRecord(name, records) + + if subtype == RECORD_PACK_TYPE_DESCRIPTOR: + name, fields = value + name = to_str(name) + return RecordDescriptor._unpack(name, fields) + + raise Exception("Unknown subtype: %x" % subtype) + + def unpack(self, d): + return unpackb(d, ext_hook=self.unpack_obj, use_list=False) diff --git a/flow/record/selector.py b/flow/record/selector.py new file mode 100644 index 0000000..ba00308 --- /dev/null +++ b/flow/record/selector.py @@ -0,0 +1,714 @@ +import __future__ + +import ast +import operator +import re + +from flow.record.base import GroupedRecord, Record, dynamic_fieldtype +from flow.record.fieldtypes import net +from flow.record.whitelist import WHITELIST, WHITELIST_TREE + +try: + import astor + HAVE_ASTOR = True +except ImportError: + HAVE_ASTOR = False + +string_types = (str, type(u'')) + +AST_NODE_S_TYPES = tuple( + filter(None, [ + getattr(ast, "Str", None), + getattr(ast, "Bytes", None), + ]), +) + +AST_NODE_VALUE_TYPES = tuple( + filter(None, [ + getattr(ast, "NameConstant", None), + getattr(ast, "Constant", None), + ]), +) + +AST_OPERATORS = { + ast.Add: operator.add, + ast.Mult: operator.mul, + ast.Div: operator.truediv, + ast.And: operator.and_, + ast.Or: operator.or_, + ast.Not: operator.not_, + ast.Mod: operator.mod, + ast.BitAnd: operator.and_, + ast.BitOr: operator.or_, +} + +AST_COMPARATORS = { + ast.Eq: operator.eq, + ast.In: lambda left, right: + False if (isinstance(left, NoneObject) or isinstance(right, NoneObject)) + else operator.contains(right, left), + ast.NotIn: lambda left, right: + False if (isinstance(left, NoneObject) or isinstance(right, NoneObject)) + else operator.contains(right, left) is False, + ast.NotEq: operator.ne, + ast.Gt: operator.gt, + ast.Lt: operator.lt, + ast.GtE: operator.ge, + ast.LtE: operator.le, + ast.Is: operator.is_, + ast.IsNot: operator.is_not, +} + + +class NoneObject: + """Returned in the Selector matching if a field does not exist on the Record. + + NoneObject is used to override some comparators like __contains__. 
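+
+    Illustrative example (not part of the original docstring): comparisons against a
+    missing field quietly evaluate to False instead of raising:
+
+        value = getattr(record, "missing_field", NONE_OBJECT)
+        value == "foo"   # False
+        "foo" in value   # False, via __contains__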
+ """ + + def __eq__(a, b): + return False + + def __ne__(a, b): + return False + + def __lt__(a, b): + return False + + def __gt__(a, b): + return False + + def __lte__(a, b): + return False + + def __gte__(a, b): + return False + + def __noteq__(a, b): + return False + + def __contains__(a, b): + return False + + def __len__(self): + return 0 + + +NONE_OBJECT = NoneObject() + + +class InvalidSelectorError(Exception): + pass + + +class InvalidOperation(Exception): + pass + + +def lower(s): + """Return lowercased string, otherwise `s` if not string type.""" + if isinstance(s, string_types): + return s.lower() + return s + + +def upper(s): + """Return uppercased string, otherwise `s` if not string type.""" + if isinstance(s, string_types): + return s.upper() + return s + + +def names(r): + """Return the available names as a set in the Record otherwise ['UnknownRecord'].""" + if isinstance(r, GroupedRecord): + return set(sub_record._desc.name for sub_record in r.records) + if isinstance(r, (Record, WrappedRecord)): + return set([r._desc.name]) + return ["UnknownRecord"] + + +def name(r): + """Return the name of the Record otherwise 'UnknownRecord'.""" + if isinstance(r, (Record, WrappedRecord)): + return r._desc.name + return "UnknownRecord" + + +def get_type(obj): + """Return the type of the Object as 'str'.""" + return str(type(obj)) + + +def has_field(r, field): + """Check if field exists on Record object. + + Args: + r: Record to match on. + field_name: Field name + + Returns: + (bool): True if field exists, otherwise False + + """ + return field in r._desc.fields + + +def field_regex(r, fields, regex): + """Check a regex against fields of a Record object. + + Args: + r: The record to match on. + fields: The fields in the Record to match. + regex: The regex pattern to search for. + + Returns: + (bool): True or False + + """ + s_pattern = re.compile(regex) + for field in fields: + fvalue = getattr(r, field, NONE_OBJECT) + if fvalue is NONE_OBJECT: + continue + + match = re.search(s_pattern, fvalue) + if match is not None: + return True + return False + + +def field_equals(r, fields, strings, nocase=True): + """Check for exact string matches on fields of a Record object. + + Args: + r: The record to match on. + fields: The fields in the Record to match. + strings: The strings to search for. + nocase: Should the matching be case insensitive. + + Returns: + (bool): True or False + + """ + if nocase: + strings_to_check = [lower(s) for s in strings] + else: + strings_to_check = strings + + for field in fields: + fvalue = getattr(r, field, NONE_OBJECT) + if fvalue is NONE_OBJECT: + continue + if nocase: + fvalue = lower(fvalue) + for s in strings_to_check: + if s == fvalue: + return True + return False + + +def field_contains(r, fields, strings, nocase=True, word_boundary=False): + """Check if the string matches on fields of a Record object. + + Only supports strings for now and partial matches using the __contains__ operator. + + * `fields` is a list of field names to check + * `strings` is a list of strings to check on the fields + * `word_boundary` is a boolean. True if matching required only word boundary matches. + * Non existing fields on the Record object are skipped. + * Defaults to case-insensitive matching, use `nocase=False` if you want to be case sensitive. 
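+
+    A minimal usage sketch (illustrative, assuming a record `r` with a string field `url`):
+
+        field_contains(r, ["url"], ["fox-it"])                   # case-insensitive substring
+        field_contains(r, ["url"], ["fox"], word_boundary=True)  # whole words only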
+ """ + if nocase: + strings_to_check = [lower(s) for s in strings] + else: + strings_to_check = strings + + for field in fields: + fvalue = getattr(r, field, NONE_OBJECT) + if fvalue is NONE_OBJECT: + continue + if nocase: + fvalue = lower(fvalue) + for s in strings_to_check: + if word_boundary is False: + if s in fvalue: + return True + else: + if fvalue is None: + if s is None: + return True + continue + + if not isinstance(fvalue, string_types): + continue + + s_pattern = u"\\b{}\\b".format(re.escape(s)) + match = re.search(s_pattern, fvalue) + if match is not None: + return True + return False + + +# Function whitelist that are allowed in selectors +FUNCTION_WHITELIST = [ + lower, upper, name, names, get_type, field_contains, field_equals, field_regex, has_field, +] + + +def resolve_attr_path(node): + """Resolve a node attribute to full path, eg: net.ipv4.Subnet.""" + x = node.func + attr_path = [] + while isinstance(x, ast.Attribute): + attr_path.append(x.attr) + x = x.value + if isinstance(x, ast.Name): + attr_path.append(x.id) + return '.'.join(reversed(attr_path)) + + +class SelectorResult: + + def __init__(self, expression_str, match_result, backtrace, referenced_fields): + self.expresssion_str = expression_str + self.result = match_result + self.backtrace_info = backtrace + self.referenced_fields = referenced_fields + + def backtrace(self): + result = u"" + max_source_line_length = len(self.expresssion_str) + for row in self.backtrace_info[::-1]: + result += u"{}-> {}\n".format( + row[0].rstrip().ljust(max_source_line_length + 15), + row[1]) + return result + + +class Selector: + VERBOSITY_ALL = 1 + VERBOSITY_BRANCHES = 2 + VERBOSITY_NONE = 3 + + def __init__(self, expression): + expression = expression or "True" + self.expression_str = expression + self.expression = compile( + source=expression, + filename="", + mode="eval", + flags=ast.PyCF_ONLY_AST | __future__.unicode_literals.compiler_flag, + ) + self.matcher = None + + def __str__(self): + return self.expression_str + + def __repr__(self): + return 'Selector({!r})'.format(self.expression_str) + + def __contains__(self, record): + return self.match(record) + + def explain_selector(self, record, verbosity=VERBOSITY_ALL): + matcher = RecordContextMatcher(self.expression, self.expression_str, backtrace_verbosity=verbosity) + match_result = matcher.matches(record) + backtrace_info = matcher.selector_backtrace + if not HAVE_ASTOR: + backtrace_info.append(("WARNING: astor module not installed, trace not available", False)) + return SelectorResult(self.expression_str, match_result, backtrace_info, []) + + def match(self, record): + if not self.matcher: + self.matcher = RecordContextMatcher(self.expression, self.expression_str) + + result = self.matcher.matches(record) + return result + + +class WrappedRecord: + """WrappedRecord wraps a Record but will return a NoneObject for non existing attributes.""" + + __slots__ = ("record", ) + + def __init__(self, record): + self.record = record + + def __getattr__(self, k): + return getattr(self.record, k, NONE_OBJECT) + + +class CompiledSelector: + """CompiledSelector is faster than Selector but unsafe if you don't trust the query.""" + + def __init__(self, expression): + self.expression = expression or None + self.code = None + self.ns = {func.__name__: func for func in FUNCTION_WHITELIST} + self.ns["net"] = net + + if expression: + self.code = compile( + source=expression, + filename="", + mode="eval", + flags=__future__.unicode_literals.compiler_flag, + ) + + def __str__(self): + return 
self.expression + + def __repr__(self): + return 'CompiledSelector({!r})'.format(self.expression) + + def __contains__(self, record): + return self.match(record) + + def match(self, record): + if self.code is None: + return True + ns = self.ns.copy() + ns.update({ + "r": WrappedRecord(record), + "Type": TypeMatcher(record), + }) + return eval(self.code, ns) + + +class TypeMatcher: + """ + Helper to get and check fields of a certain type. + + Types can be selected using `Type.`. Attributes can be selected + using `Type..`. + + For example `Type.uri.filename` will retrieve all the filenames from all + uri's in a record. + + These selectors can also still be used in other helper functions, as + they will unwrap to resulting fieldnames. So for example, you can still + do `field_contains(r, Type.string, ['something'])`, which will check + all `string` fields. + + Membership tests also work. `'something' in Type.string` will perform + a membership test in each string value and return True if there are any. + + Reverse membership tests are trickier, and only work with a non-compiled + Selector. For example, `Type.net.ipv4.Address in net.ipv4.Subnet('10.0.0.0/8')` + requires the TypeMatcher to unroll its values, which is only possible + when overriding this behaviour. + """ + + def __init__(self, rec): + self._rec = rec + + def __getattr__(self, attr): + if attr in WHITELIST_TREE: + return TypeMatcherInstance(self._rec, [attr]) + + return NONE_OBJECT + + +class TypeMatcherInstance: + + def __init__(self, rec, ftypeparts=None, attrs=None): + self._rec = rec + self._ftypeparts = ftypeparts or [] + self._attrs = attrs or [] + + self._ftype = None + self._ftypetree = WHITELIST_TREE + for p in ftypeparts: + self._ftypetree = self._ftypetree[p] + + if self._ftypetree is True: + self._ftype = '.'.join(ftypeparts) + + def __getattr__(self, attr): + if not self._ftype: + if attr not in self._ftypetree: + return NONE_OBJECT + + ftypeparts = self._ftypeparts + [attr] + return TypeMatcherInstance(self._rec, ftypeparts) + elif not attr.startswith('_'): + attrs = self._attrs + [attr] + return TypeMatcherInstance(self._rec, self._ftypeparts, attrs) + + return NONE_OBJECT + + def __iter__(self): + return self._fields() + + def _fields(self): + for f in self._rec._desc.getfields(self._ftype): + yield f.name + + def _values(self): + for f in self._fields(): + obj = getattr(self._rec, f, NONE_OBJECT) + for a in self._attrs: + obj = getattr(obj, a, NONE_OBJECT) + + if obj is NONE_OBJECT: + continue + + yield obj + + def _subrecords(self): + """Return all fields that are records (records in records). 
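+
+        For example (illustrative): a record with a `record` field "parent" and a
+        `record[]` field "children" yields the parent record plus every child record,
+        skipping values that are None.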
+ + Returns: list of records + """ + fields = self._rec._desc.getfields("record") + for f in fields: + r = getattr(self._rec, f.name) + if r is not None: + yield r + + fields = self._rec._desc.getfields("record[]") + for f in fields: + records = getattr(self._rec, f.name) + if records is not None: + for r in records: + yield r + + def _op(self, op, other): + for v in self._values(): + if op(v, other): + return True + + subrecords = self._subrecords() + for record in subrecords: + type_matcher = TypeMatcherInstance(record, self._ftypeparts, self._attrs) + if type_matcher._op(op, other): + return True + + return False + + def __eq__(self, other): + return self._op(operator.eq, other) + + def __ne__(self, other): + return self._op(operator.ne, other) + + def __lt__(self, other): + return self._op(operator.lt, other) + + def __gt__(self, other): + return self._op(operator.gt, other) + + def __lte__(self, other): + return self._op(operator.le, other) + + def __gte__(self, other): + return self._op(operator.ge, other) + + def __noteq__(self, other): + return self._op(operator.ne, other) + + def __contains__(self, other): + return self._op(operator.contains, other) + + +class RecordContextMatcher: + + def __init__(self, expr, expr_str, backtrace_verbosity=Selector.VERBOSITY_NONE): + self.expression = expr + self.expression_str = expr_str + self.selector_backtrace = [] + self.selector_backtrace_verbosity = backtrace_verbosity + self.data = {} + self.rec = None + + def matches(self, rec): + self.selector_backtrace = [] + self.data = { + "None": None, + "True": True, + "False": False, + "str": str, + "fields": rec._desc.getfields, + "any": any, + "all": all, + } + + # Add whitelisted functions to global dict + self.data.update({ + func.__name__: func for func in FUNCTION_WHITELIST + }) + + self.data["r"] = rec + self.rec = rec + + # This ensures backwards compatibility with old Selector queries + self.data["obj"] = rec + + # Type matcher + self.data["Type"] = TypeMatcher(rec) + + return self.eval(self.expression.body) + + def eval(self, node): + r = self._eval(node) + verbosity = self.selector_backtrace_verbosity + log_trace = ( + (verbosity == Selector.VERBOSITY_ALL) or + (verbosity == Selector.VERBOSITY_BRANCHES and isinstance(node, (ast.Compare, ast.BoolOp))) + ) + if log_trace and HAVE_ASTOR: + source_line = astor.to_source(node) + self.selector_backtrace.append((source_line, r)) + return r + + def _eval(self, node): + if isinstance(node, ast.Num): + return node.n + elif isinstance(node, AST_NODE_S_TYPES): + return node.s + elif isinstance(node, AST_NODE_VALUE_TYPES): + return node.value + elif isinstance(node, ast.List): + return list(map(self.eval, node.elts)) + elif isinstance(node, ast.Tuple): + return tuple(map(self.eval, node.elts)) + elif isinstance(node, ast.Name): + if node.id not in self.data: + return getattr(dynamic_fieldtype, node.id) + + return self.data[node.id] + elif isinstance(node, ast.Attribute): + if node.attr.startswith('__'): + raise InvalidOperation( + "Selector {!r} contains invalid attribute: {!r}".format( + self.expression_str, node.attr)) + + obj = self.eval(node.value) + + return getattr(obj, node.attr, NONE_OBJECT) + elif isinstance(node, ast.BoolOp): + values = [] + for expr in node.values: + try: + value = self.eval(expr) + except TypeError as e: + if 'NoneType' in str(e): + value = False + else: + raise + value = bool(value) + values.append(value) + result = values.pop(0) + for value in values: + result = AST_OPERATORS[type(node.op)](result, value) + return 
result + elif isinstance(node, ast.BinOp): + left = self.eval(node.left) + right = self.eval(node.right) + if isinstance(left, NoneObject) or isinstance(right, NoneObject): + return False + return AST_OPERATORS[type(node.op)](left, right) + elif isinstance(node, ast.UnaryOp): + return AST_OPERATORS[type(node.op)](self.eval(node.operand)) + elif isinstance(node, ast.Compare): + left = self.eval(node.left) + right = self.eval(node.comparators[0]) + + # print [AST_COMPARATORS[type(node.ops[0])](getattr(self.rec, l.name), right) for l in left] + # return [AST_COMPARATORS[type(node.ops[0])](getattr(self.rec, l.name), right) for l in left] + + comptype = type(node.ops[0]) + comp = AST_COMPARATORS[comptype] + + # Special case for __contains__, where we need to first unwrap all values matching the Type query + if comptype in (ast.In, ast.NotIn) and isinstance(left, TypeMatcherInstance): + for v in left._values(): + if comp(v, right): + return True + return False + return comp(left, right) + elif isinstance(node, ast.Call): + if not isinstance(node.func, (ast.Attribute, ast.Name)): + raise InvalidOperation("Error, only ast.Attribute or ast.Name are expected") + + func_name = resolve_attr_path(node) + if not (callable(self.data.get(func_name)) or func_name in WHITELIST): + raise InvalidOperation( + "Call '{}' not allowed. No calls other then whitelisted 'global' calls allowed!".format( + func_name)) + + func = self.eval(node.func) + + args = list(map(self.eval, node.args)) + kwargs = dict((kw.arg, self.eval(kw.value)) for kw in node.keywords) + + return func(*args, **kwargs) + + elif isinstance(node, ast.comprehension): + iter = self.eval(node.iter) + return iter + + elif isinstance(node, ast.GeneratorExp): + def recursive_generator(gens): + """ + Yield all the values in the most deepest generator. + + Example: + [ord(c) for line in file for c in line] + This function would yield all c values for this expression + + Args: + gens: A list of generator/ comprehension objects + + Returns: + Generator + """ + gens = list(gens) + gen = gens.pop() + loop_index_var_name = gen.target.id + resolved_gen = self.eval(gen) + if resolved_gen is not NONE_OBJECT: + for val in resolved_gen: + self.data[loop_index_var_name] = val + if len(gens) > 0: + for subval in recursive_generator(gens): + yield subval + else: + yield val + + def generator_expr(): + """ + Embedded generator logic for ast.GeneratorExp. + + A function can't yield and return so we write nested generator function and return that. 
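+
+            For example (illustrative): a selector such as
+            `any(f == 'passwd' for f in r.filenames)` ends up here; every value of
+            `f` is bound in self.data before `node.elt` is evaluated for it.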
+ + Returns: + yields evaluated generator expression values + + """ + for gen in node.generators: + if gen.target.id in self.data: + raise InvalidOperation( + "Generator variable '{}' overwrites existing variable!".format( + gen.target.id)) + values = recursive_generator(node.generators[::-1]) + for val in values: + result = self.eval(node.elt) + yield result + return generator_expr() + + raise TypeError(node) + + +def make_selector(selector, force_compiled=False): + """Return a Selector object (either CompiledSelector or Selector).""" + ret = selector + if not selector: + ret = None + elif isinstance(selector, string_types): + ret = CompiledSelector(selector) if force_compiled else Selector(selector) + elif isinstance(selector, Selector): + if force_compiled: + ret = CompiledSelector(selector.expression_str) + return ret diff --git a/flow/record/stream.py b/flow/record/stream.py new file mode 100644 index 0000000..5723aec --- /dev/null +++ b/flow/record/stream.py @@ -0,0 +1,293 @@ +from __future__ import print_function + +import os +import sys +import struct +import logging +import datetime +from functools import lru_cache +from collections import ChainMap + +from .base import RecordDescriptor, RecordReader +from .packer import RecordPacker +from flow.record import RecordWriter +from flow.record.selector import make_selector +from flow.record.fieldtypes import fieldtype_for_value + + +log = logging.getLogger(__package__) + +RECORDSTREAM_MAGIC = b"RECORDSTREAM\n" + + +def RecordOutput(fp): + """Return a RecordPrinter if `fp` is a tty otherwise a RecordStreamWriter.""" + if hasattr(fp, "isatty") and fp.isatty(): + return RecordPrinter(fp) + return RecordStreamWriter(fp) + + +class RecordPrinter: + """Records are printed as textual representation (repr) to fp.""" + + fp = None + + def __init__(self, fp, flush=True): + self.fp = fp + self.auto_flush = flush + + def write(self, obj): + buf = repr(obj).encode() + b"\n" + self.fp.write(buf) + if self.auto_flush: + self.flush() + + def flush(self): + self.fp.flush() + + def close(self): + pass + + +class RecordStreamWriter: + """Records are written as binary (serialized) to fp.""" + + fp = None + packer = None + + def __init__(self, fp): + self.fp = fp + self.packer = RecordPacker() + self.packer.on_descriptor.add_handler(self.on_new_descriptor) + self.header_written = False + + def __del__(self): + self.close() + + def on_new_descriptor(self, descriptor): + self.write(descriptor) + + def close(self): + if self.fp and self.fp != getattr(sys.stdout, "buffer", sys.stdout): + self.fp.close() + self.fp = None + + def flush(self): + if not self.header_written: + self.writeheader() + + def write(self, obj): + if not self.header_written: + self.writeheader() + blob = self.packer.pack(obj) + self.fp.write(struct.pack(">I", len(blob))) + self.fp.write(blob) + + def writeheader(self): + self.header_written = True + self.write(RECORDSTREAM_MAGIC) + + +class RecordStreamReader: + fp = None + recordtype = None + descs = None + packer = None + + def __init__(self, fp, selector=None): + self.fp = fp + self.closed = False + self.selector = make_selector(selector) + self.packer = RecordPacker() + self.readheader() + + def readheader(self): + # Manually read the msgpack format to avoid unserializing invalid data + # we read size (4) + msgpack type (2) + msgpack bytes (recordstream magic) + header = self.fp.read(4 + 2 + len(RECORDSTREAM_MAGIC)) + if not header.endswith(RECORDSTREAM_MAGIC): + raise IOError("Unknown file format, not a RecordStream") + + def 
read(self): + d = self.fp.read(4) + if len(d) != 4: + raise EOFError() + + size = struct.unpack(">I", d)[0] + d = self.fp.read(size) + return self.packer.unpack(d) + + def close(self): + self.closed = True + + def __iter__(self): + try: + while not self.closed: + obj = self.read() + if obj == RECORDSTREAM_MAGIC: + continue + if isinstance(obj, RecordDescriptor): + self.packer.register(obj) + else: + if not self.selector or self.selector.match(obj): + yield obj + except EOFError: + pass + + +def record_stream(sources, selector=None): + """Return a Record stream generator from the given Record sources. + + Exceptions in a Record source will be caught so the stream is not interrupted. + """ + log.debug("Record stream with selector: {!r}".format(selector)) + for src in sources: + # Inform user that we are reading from stdin + if src in ("-", ""): + print("[reading from stdin]", file=sys.stderr) + + # Initial value for reader, in case of exception message + reader = "RecordReader" + try: + reader = RecordReader(src, selector=selector) + for rec in reader: + yield rec + reader.close() + except IOError as e: + log.error("{}({!r}): {}".format(reader, src, e)) + except KeyboardInterrupt: + raise + except Exception as e: # noqa: B902 + log.warning( + "Exception in {!r} for {!r}: {!r} -- skipping to next reader".format( + reader, src, e)) + continue + + +class PathTemplateWriter: + """Write records to a path on disk, path can be a template string. + + This allows for archiving records on disk based on timestamp for example. + + Default template string is: + + '{name}-{record._generated:%Y%m%dT%H}.records.gz' + + Available template fields: + + `name` defaults to "records", but can be overridden in the initializer. + `record` is the record object + `ts` is record._generated + + If the destination path already exists it will rename the existing file using the current datetime. + """ + + DEFAULT_TEMPLATE = '{name}-{record._generated:%Y%m%dT%H}.records.gz' + + def __init__(self, path_template=None, name=None): + self.path_template = path_template or self.DEFAULT_TEMPLATE + self.name = name or "records" + self.current_path = None + self.writer = None + self.stream = None + + def rotate_existing_file(self, path): + if os.path.exists(path): + now = datetime.datetime.utcnow() + src = os.path.realpath(path) + + src_dir = os.path.dirname(src) + src_fname = os.path.basename(src) + + # stamp will be part of new filename to denote rotation stamp + stamp = '{now:%Y%m%dT%H%M%S}'.format(now=now) + + # Use "records.gz" as the extension if we have this naming convention + if src_fname.endswith('.records.gz'): + fname, _ = src_fname.rsplit('.records.gz', 1) + ext = "records.gz" + else: + fname, ext = os.path.splitext(src_fname) + + # insert the rotation stamp into the new filename. 
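+            # Example (illustrative, not part of the original code): an existing
+            # 'daily.records.gz' is renamed to e.g. 'daily.20220101T120000.records.gz'
+            # (current UTC timestamp) before new records are written to the original path.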
+ dst = os.path.join(src_dir, '{fname}.{stamp}.{ext}'.format(**locals())) + log.info('RENAME {!r} -> {!r}'.format(src, dst)) + os.rename(src, dst) + + def record_stream_for_path(self, path): + if self.current_path != path: + self.current_path = path + log.info('Writing records to {!r}'.format(path)) + self.rotate_existing_file(path) + dst_dir = os.path.dirname(path) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + rs = RecordWriter(path) + self.close() + self.writer = rs + return self.writer + + def write(self, record): + ts = record._generated or datetime.datetime.utcnow() + path = self.path_template.format(name=self.name, record=record, ts=ts) + rs = self.record_stream_for_path(path) + rs.write(record) + rs.fp.flush() + + def close(self): + if self.writer: + self.writer.close() + + +class RecordArchiver(PathTemplateWriter): + """RecordWriter that writes/archives records to a path with YYYY/mm/dd.""" + + def __init__(self, archive_path, path_template=None, name=None): + path_template = path_template or self.DEFAULT_TEMPLATE + template = os.path.join(str(archive_path), "{ts:%Y/%m/%d}", path_template) + PathTemplateWriter.__init__(self, path_template=template, name=name) + + +class RecordFieldRewriter: + """Rewrite records using a new RecordDescriptor for chosen fields and/or excluded or new record fields.""" + + def __init__(self, fields=None, exclude=None, expression=None): + self.fields = fields or [] + self.exclude = exclude or [] + self.expression = compile(expression, '', 'exec') if expression else None + + @lru_cache(maxsize=256) + def record_descriptor_for_fields(self, descriptor, fields=None, exclude=None, new_fields=None): + if not fields and not exclude and not new_fields: + return descriptor + exclude = exclude or [] + desc_fields = [] + if fields: + for fname in fields: + if fname in exclude: + continue + field = descriptor.fields.get(fname, None) + if field: + desc_fields.append((field.typename, field.name)) + else: + desc_fields = [(ftype, fname) for (ftype, fname) in descriptor.get_field_tuples() if fname not in exclude] + if new_fields: + desc_fields.extend(new_fields) + return RecordDescriptor(descriptor.name, desc_fields) + + def rewrite(self, record): + if not self.fields and not self.exclude and not self.expression: + return record + + local_dict = {} + new_fields = [] + if self.expression: + exec(self.expression, record._asdict(), local_dict) + # convert new variables to new record fields (field type is derived from value) + new_fields = [(fieldtype_for_value(val, "string"), key) for key, val in local_dict.items()] + + RewriteRecord = self.record_descriptor_for_fields( + record._desc, tuple(self.fields), tuple(self.exclude), tuple(new_fields) + ) + # give new variables precendence + return RewriteRecord.init_from_dict(ChainMap(local_dict, record._asdict())) diff --git a/flow/record/tools/__init__.py b/flow/record/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flow/record/tools/geoip.py b/flow/record/tools/geoip.py new file mode 100644 index 0000000..3a940fa --- /dev/null +++ b/flow/record/tools/geoip.py @@ -0,0 +1,194 @@ +# Python imports +import re +import sys +import random +import argparse +import logging + +# Flow imports +from flow.record.utils import catch_sigpipe +from flow.record import ( + RecordDescriptor, + RecordWriter, + record_stream, + extend_record, +) + +# Third party imports +import maxminddb + + +logger = logging.getLogger(__name__) + +IPv4Record = RecordDescriptor( + "geo/ipv4", + [ + ("net.ipaddress", "ip"), + 
], +) + +GeoRecord = RecordDescriptor( + "maxmind/geo", + [ + ("string", "country"), + ("string", "country_code"), + ("string", "city"), + ("float", "longitude"), + ("float", "latitude"), + ], +) + +AsnRecord = RecordDescriptor( + "maxmind/asn", + [ + ("string", "asn"), + ("string", "org"), + ], +) + +DEFAULT_CITY_DB = "/usr/share/GeoIP/GeoLite2-City.mmdb" +DEFAULT_ASN_DB = "/usr/share/GeoIP/GeoLite2-ASN.mmdb" +REGEX_IPV4 = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}") + + +def georecord_for_ip(city_db, ip): + r = city_db.get(ip) if city_db else None + if not r: + return GeoRecord() + + loc_dict = r.get("location", {}) + country_dict = r.get("country", {}) + city_dict = r.get("city", {}) + + country = country_dict.get("names", {}).get("en") + country_code = country_dict.get("iso_code") + city = city_dict.get("names", {}).get("en") + lon = loc_dict.get("longitude") + lat = loc_dict.get("latitude") + + return GeoRecord( + country=country, + country_code=country_code, + city=city, + longitude=lon, + latitude=lat, + ) + + +def asnrecord_for_ip(asn_db, ip): + r = asn_db.get(ip) if asn_db else None + if not r: + return AsnRecord() + asn = r.get("autonomous_system_number", None) + org = r.get("autonomous_system_organization", None) + return AsnRecord(asn=asn, org=org) + + +def ip_records_from_text_files(files): + """Yield IPv4Records by extracting IP addresses from `files` using a regex.""" + for fname in files: + with open(fname, "r") if fname != "-" else sys.stdin as f: + for line in f: + for ip in REGEX_IPV4.findall(line): + yield IPv4Record(ip) + + +@catch_sigpipe +def main(): + parser = argparse.ArgumentParser( + description="Annotate records with GeoIP and ASN data", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-c", "--city-db", default=DEFAULT_CITY_DB, help="path to GeoIP city database" + ) + parser.add_argument( + "-a", "--asn-db", default=DEFAULT_ASN_DB, help="path to GeoIP ASN database" + ) + parser.add_argument( + "-i", + "--ip-field", + metavar="FIELD", + default="ip", + help="the source record field to use for lookups", + ) + parser.add_argument( + "-w", + "--writer", + metavar="OUTPUT", + default="-", + help="write records to output", + ) + parser.add_argument("input", nargs="*", default=["-"], help="input files") + parser.add_argument( + "-t", + "--text", + action="store_true", + help="treats input as text and extract IPv4 Records using regex", + ) + + # Hidden options + parser.add_argument( + "-m", "--mode", type=int, default=maxminddb.MODE_AUTO, help=argparse.SUPPRESS + ) + parser.add_argument("-g", "--generate", action="store_true", help=argparse.SUPPRESS) + args = parser.parse_args() + + if args.generate: + with RecordWriter() as writer: + while True: + record = IPv4Record(random.randint(0, 0xFFFFFFFF)) + writer.write(record) + + if args.mode: + logger.warning("MODE: %u", args.mode) + + try: + city_db = maxminddb.open_database(args.city_db, args.mode) + except FileNotFoundError: + logger.warning( + "[*] Disabled Geo record annotation. (database not found: %r)", + args.city_db, + ) + city_db = None + + try: + asn_db = maxminddb.open_database(args.asn_db, args.mode) + except FileNotFoundError: + logger.warning( + "[*] Disabled ASN record annotation. (database not found: %r)", args.asn_db + ) + asn_db = None + + if not any([city_db, asn_db]) and not args.text: + print( + "[!] Both City and ASN database not available. 
Nothing to annotate, exiting..", + file=sys.stderr, + ) + return 1 + + if args.text: + # Input are text files, extract IPv4Records from text using a regex + record_iterator = ip_records_from_text_files(args.input) + else: + # Input are Record files + record_iterator = record_stream(args.input) + + with RecordWriter(args.writer) as writer: + for record in record_iterator: + ip = getattr(record, args.ip_field, None) + + annotated_records = [] + if city_db: + geo_record = georecord_for_ip(city_db, str(ip)) if ip else GeoRecord() + annotated_records.append(geo_record) + if asn_db: + asn_record = asnrecord_for_ip(asn_db, str(ip)) if ip else AsnRecord() + annotated_records.append(asn_record) + + record = extend_record(record, annotated_records) + writer.write(record) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/flow/record/tools/rdump.py b/flow/record/tools/rdump.py new file mode 100644 index 0000000..3d550e9 --- /dev/null +++ b/flow/record/tools/rdump.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +from __future__ import print_function + +import sys +import logging + +from flow.record import RecordWriter, record_stream +from flow.record.stream import RecordFieldRewriter +from flow.record.selector import make_selector +from flow.record.utils import catch_sigpipe + +try: + from flow.record.version import version +except ImportError: + version = "unknown" + +log = logging.getLogger(__name__) + +try: + # Python 2 + import urlparse + from urllib import urlencode +except ImportError: + # Python 3 + import urllib.parse as urlparse + from urllib.parse import urlencode + + +@catch_sigpipe +def main(): + import argparse + parser = argparse.ArgumentParser( + description="Record dumper, a tool that can read, write and filter records", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + '--version', action='version', version="flow.record version {}".format(version)) + parser.add_argument( + 'src', metavar='SOURCE', nargs='*', default=['-'], + help='Record source') + parser.add_argument( + '-v', '--verbose', action='count', default=0, + help='Increase verbosity') + + misc = parser.add_argument_group("miscellaneous") + misc.add_argument( + '-l', '--list', action='store_true', + help='List unique Record Descriptors') + misc.add_argument( + '-n', '--no-compile', action='store_true', + help="Don't use a compiled selector (safer, but slower)") + misc.add_argument( + '--record-source', default=None, + help='Overwrite the record source field') + misc.add_argument( + '--record-classification', default=None, + help='Overwrite the record classification field') + + selection = parser.add_argument_group('selection') + selection.add_argument( + '-F', '--fields', metavar='FIELDS', + help='Fields (comma seperated) to output in dumping') + selection.add_argument( + '-X', '--exclude', metavar='FIELDS', + help='Fields (comma seperated) to exclude in dumping') + selection.add_argument( + '-s', '--selector', metavar='SELECTOR', default=None, + help='Only output records matching Selector') + + output = parser.add_argument_group('output control') + output.add_argument( + '-f', '--format', metavar='FORMAT', + help='Format string') + output.add_argument( + '-c', '--count', type=int, + help='Exit after COUNT records') + output.add_argument( + '-w', '--writer', metavar='OUTPUT', default=None, + help='Write records to output') + output.add_argument( + '-m', '--mode', default=None, choices=("csv", "json", "jsonlines", "line"), + help='Output mode') + + advanced = 
parser.add_argument_group('advanced') + advanced.add_argument( + '-E', "--exec-expression", + help="execute a (Python) expression for each record AFTER selector matching, can be used to assign new fields") + + aliases = parser.add_argument_group('aliases') + aliases.add_argument( + '-j', '--json', action='store_const', const='json', dest='mode', + default=argparse.SUPPRESS, + help='Short for --mode=json') + aliases.add_argument( + '-J', '--jsonlines', action='store_const', const='jsonlines', dest='mode', + default=argparse.SUPPRESS, + help='Short for --mode=jsonlines') + aliases.add_argument( + '-C', '--csv', action='store_const', const='csv', dest='mode', + default=argparse.SUPPRESS, + help='Short for --mode=csv') + aliases.add_argument( + "-L", "--line", action='store_const', const='line', dest='mode', + default=argparse.SUPPRESS, + help='Short for --mode=line') + + args = parser.parse_args() + + levels = [logging.WARNING, logging.INFO, logging.DEBUG] + level = levels[min(len(levels) - 1, args.verbose)] + logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(message)s") + + fields_to_exclude = args.exclude.split(",") if args.exclude else [] + fields = args.fields.split(",") if args.fields else [] + + uri = args.writer or "text://" + if not args.writer: + mode_to_uri = { + "csv": "csvfile://", + "json": "jsonfile://?indent=2", + "jsonlines": "jsonfile://", + "line": "line://", + } + uri = mode_to_uri.get(args.mode, uri) + qparams = { + "fields": args.fields, + "exclude": args.exclude, + "format_spec": args.format, + } + query = urlencode({k: v for k, v in qparams.items() if v}) + uri += "&" if urlparse.urlparse(uri).query else "?" + query + + record_field_rewriter = None + if fields or fields_to_exclude or args.exec_expression: + record_field_rewriter = RecordFieldRewriter(fields, fields_to_exclude, args.exec_expression) + + selector = make_selector(args.selector, not args.no_compile) + seen_desc = set() + count = 0 + with RecordWriter(uri) as record_writer: + for count, rec in enumerate(record_stream(args.src, selector)): + if args.count and count >= args.count: + break + + if args.record_source is not None: + rec._source = args.record_source + if args.record_classification is not None: + rec._classification = args.record_classification + if record_field_rewriter: + rec = record_field_rewriter.rewrite(rec) + + # Dump RecordDescriptors + if args.list: + desc = rec._desc + if desc.descriptor_hash not in seen_desc: + seen_desc.add(desc.descriptor_hash) + print("# {}".format(desc)) + print(desc.definition()) + print() + continue + + record_writer.write(rec) + + if args.list: + print("Processed {} records".format(count)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/flow/record/utils.py b/flow/record/utils.py new file mode 100644 index 0000000..bffccc0 --- /dev/null +++ b/flow/record/utils.py @@ -0,0 +1,87 @@ +import os +import sys +import base64 +from functools import wraps + +_native = str +_unicode = type(u'') +_bytes = type(b'') + + +def is_stdout(fp): + return fp == getattr(sys.stdout, "buffer", sys.stdout) + + +def to_bytes(value): + """Convert a value to a byte string.""" + if value is None or isinstance(value, _bytes): + return value + if isinstance(value, _unicode): + return value.encode("utf-8") + return _bytes(value) + + +def to_str(value): + """Convert a value to a unicode string.""" + if value is None or isinstance(value, _unicode): + return value + if isinstance(value, _bytes): + return value.decode("utf-8") + return _unicode(value) + + +def 
to_native_str(value): + """Convert a value to a native `str`.""" + if value is None or isinstance(value, _native): + return value + if isinstance(value, _unicode): + # Python 2: unicode -> str + return value.encode("utf-8") + if isinstance(value, _bytes): + # Python 3: bytes -> str + return value.decode("utf-8") + return _native(value) + + +def to_base64(value): + """Convert a value to a base64 string.""" + return base64.b64encode(value).decode() + + +def catch_sigpipe(func): + """Catches KeyboardInterrupt and BrokenPipeError (OSError 22 on Windows).""" + + @wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except KeyboardInterrupt: + print("Aborted!", file=sys.stderr) + return 1 + except (BrokenPipeError, OSError) as e: + exc_type = type(e) + # Only catch BrokenPipeError or OSError 22 + if (exc_type is BrokenPipeError) or (exc_type is OSError and e.errno == 22): + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + return 1 + # Raise other exceptions + raise + + return wrapper + + +class EventHandler: + + def __init__(self): + self.handlers = [] + + def add_handler(self, callback): + self.handlers.append(callback) + + def remove_handler(self, callback): + self.handlers.remove(callback) + + def __call__(self, *args, **kwargs): + for h in self.handlers: + h(*args, **kwargs) diff --git a/flow/record/whitelist.py b/flow/record/whitelist.py new file mode 100644 index 0000000..dee0add --- /dev/null +++ b/flow/record/whitelist.py @@ -0,0 +1,40 @@ +WHITELIST = [ + "boolean", + "dynamic", + "datetime", + "filesize", + "uint16", + "uint32", + "float", + "string", + "stringlist", + "dictlist", + "unix_file_mode", + "varint", + "wstring", + "net.ipv4.Address", + "net.ipv4.Subnet", + "net.tcp.Port", + "net.udp.Port", + "uri", + "digest", + "bytes", + "record", + "net.ipaddress", + "net.ipnetwork", + "net.IPAddress", + "net.IPNetwork", +] + + +WHITELIST_TREE = {} +for field in WHITELIST: + parent = None + obj = WHITELIST_TREE + for part in field.split('.'): + if part not in obj: + obj[part] = {} + parent = obj + obj = obj[part] + + parent[part] = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6d6687e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = ["setuptools>=43.0.0", "wheel", "setuptools_scm[toml]>=3.4.1"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +write_to = "flow/record/version.py" + +[tool.black] +line-length = 120 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..34ae005 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[metadata] +author = Dissect Team +author_email = dissect@fox-it.com +url = https://github.com/fox-it/flow.record +license = Affero General Public License v3 +long_description = file: README.md +license_files = LICENSE, COPYRIGHT +classifiers = + Programming Language :: Python :: 3 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4b3a22f --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +from setuptools import setup, find_packages + +setup( + name='flow.record', + packages=['flow.' 
+ v for v in find_packages('flow')], + install_requires=[ + 'msgpack>=0.5.2', + ], + extras_require={ + # Note: these compression libraries do not work well with pypy + 'compression': [ + 'lz4', + 'zstandard', + ], + }, + namespace_packages=['flow'], + entry_points={ + 'console_scripts': [ + 'r=flow.record.tools.r:main', + 'rdd=flow.record.tools.rdd:main', + 'rselect=flow.record.tools.rselect:main', + 'rdump=flow.record.tools.rdump:main', + 'rgeoip=flow.record.tools.geoip:main', + ], + }, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/selector_explain_example.py b/tests/selector_explain_example.py new file mode 100644 index 0000000..9520b4e --- /dev/null +++ b/tests/selector_explain_example.py @@ -0,0 +1,32 @@ +from flow.record.selector import Selector +from flow.record import RecordDescriptor + +desc = RecordDescriptor("test/samplerecord", [ + ("uint16", "value"), + ("string", "x"), +]) + + +def main(): + s_str = u"r.x == u'\\u2018Test\\u2019' or r.value == 17 or (r.value == 1337 and r.x == 'YOLO')" + print(u"Evaluating selector.... \n{}".format(s_str)) + print("\n") + s = Selector(s_str) + obj = desc(0, "Test") + obj.x = u"\u2018Test\u2019" + obj.value = 16 + val = s.explain_selector(obj) + print(val.backtrace()) + + +if __name__ == "__main__": + main() + + +""" +r.x == 'Test' or r.value == 17 -> True + r.x == 'Test' -> True + or + r.value == 17 -> False + +""" diff --git a/tests/standalone_test.py b/tests/standalone_test.py new file mode 100644 index 0000000..3d8749d --- /dev/null +++ b/tests/standalone_test.py @@ -0,0 +1,16 @@ +from __future__ import print_function + + +def main(glob): + for var, val in sorted(glob.items()): + if not var.startswith("test_"): + continue + + print("{:40s}".format(var), end="") + try: + val() + print("PASSED") + except Exception: # noqa: B902 + print("FAILED") + import traceback + traceback.print_exc() diff --git a/tests/test_compiled_selector.py b/tests/test_compiled_selector.py new file mode 100644 index 0000000..ff8995f --- /dev/null +++ b/tests/test_compiled_selector.py @@ -0,0 +1,37 @@ +from flow.record import RecordDescriptor +from flow.record.selector import CompiledSelector as Selector + + +def test_selector_func_name(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "query"), + ("string", "url"), + ]) + assert TestRecord(None, None) not in Selector("name(r) == 'foo/bar'") + assert TestRecord(None, None) in Selector("name(r) == 'test/record'") + + +def test_selector(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "query"), + ("string", "url"), + ]) + + assert TestRecord("foo", "bar") in Selector("r.query == 'foo'") + assert TestRecord(None, None) not in Selector("r.query == 'foo'") + assert TestRecord(None, None) not in Selector("name(r.query) == 'XX'") + + +def test_non_existing_field(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "query"), + ("string", "url"), + ]) + + assert TestRecord("foo", "bar") not in Selector("r.query and r.non_existing_field") + assert TestRecord("foo", "bar") in Selector("not r.non_existing_field") + assert TestRecord("foo", "bar") in Selector("r.query and r.url and not r.non_existing_field") + + +if __name__ == "__main__": + __import__("standalone_test").main(globals()) diff --git a/tests/test_fieldtype_ip.py b/tests/test_fieldtype_ip.py new file mode 100644 index 0000000..94a683f --- /dev/null +++ b/tests/test_fieldtype_ip.py @@ -0,0 +1,238 @@ +from __future__ import unicode_literals + +import pytest 
+ +from flow.record import RecordDescriptor +from flow.record import RecordPacker +from flow.record.fieldtypes import net +from flow.record.selector import Selector, CompiledSelector + + +def test_field_ipaddress(): + a = net.IPAddress("192.168.1.1") + assert a == "192.168.1.1" + + with pytest.raises(ValueError) as excinfo: + net.IPAddress("a.a.a.a") + excinfo.match(".* does not appear to be an IPv4 or IPv6 address") + + +def test_field_ipnetwork(): + a = net.IPNetwork("192.168.1.0/24") + assert a == "192.168.1.0/24" + + # Host bits set + with pytest.raises(ValueError) as excinfo: + net.IPNetwork("192.168.1.10/24") + excinfo.match(".* has host bits set") + + +def test_record_ipaddress(): + TestRecord = RecordDescriptor("test/ipaddress", [ + ("net.ipaddress", "ip"), + ]) + + r = TestRecord("127.0.0.1") + assert r.ip == "127.0.0.1" + assert r.ip != "lala.1234.bad.ip" + assert isinstance(r.ip, net.ipaddress) + assert repr(r.ip) == "net.ipaddress('127.0.0.1')" + + # ipv4 + assert TestRecord("1.1.1.1").ip == "1.1.1.1" + assert TestRecord("0.0.0.0").ip == "0.0.0.0" + assert TestRecord("192.168.0.1").ip == "192.168.0.1" + assert TestRecord("255.255.255.255").ip == "255.255.255.255" + + # ipv6 + assert TestRecord("::1").ip == "::1" + assert TestRecord("2001:4860:4860::8888").ip == "2001:4860:4860::8888" + assert TestRecord("2001:4860:4860::4444").ip == "2001:4860:4860::4444" + + # instantiate from different types + assert TestRecord(1).ip == "0.0.0.1" + assert TestRecord(0x7f0000ff).ip == "127.0.0.255" + assert TestRecord(b"\x7f\xff\xff\xff").ip == "127.255.255.255" + assert TestRecord("127.0.0.1").ip == "127.0.0.1" + + # invalid ip addresses + for invalid in ["1.1.1.256", "192.168.0.1/24", "a.b.c.d", ":::::1"]: + with pytest.raises(Exception) as excinfo: + TestRecord(invalid) + excinfo.match(r'.*does not appear to be an IPv4 or IPv6 address*') + + r = TestRecord() + assert r.ip is None + + +def test_record_ipnetwork(): + TestRecord = RecordDescriptor("test/ipnetwork", [ + ("net.ipnetwork", "subnet"), + ]) + + # ipv4 + r = TestRecord("192.168.0.0/24") + assert r.subnet == "192.168.0.0/24" + assert r.subnet != "bad.sub/net" + assert "bad.ip" not in r.subnet + assert "192.168.0.1" in r.subnet + assert "192.168.0.2/32" in r.subnet + assert "192.168.0.255" in r.subnet + assert "192.168.0.128/30" in r.subnet + assert "192.168.1.1" not in r.subnet + assert isinstance(r.subnet, net.ipnetwork) + assert repr(r.subnet) == "net.ipnetwork('192.168.0.0/24')" + + r = TestRecord("192.168.1.1/32") + assert r.subnet == "192.168.1.1" + assert r.subnet == "192.168.1.1/32" + assert "192.168.1.1" in r.subnet + assert "192.168.1.1/32" in r.subnet + + # ipv6 - https://en.wikipedia.org/wiki/IPv6_address + r = TestRecord("::1") + assert r.subnet == "::1" + assert r.subnet == "::1/128" + + r = TestRecord("::/0") + assert "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff" in r.subnet + assert "::" in r.subnet + assert "::1" in r.subnet + + r = TestRecord("64:ff9b::/96") + assert "64:ff9b::0.0.0.0" in r.subnet + assert "64:ff9b::255.255.255.255" in r.subnet + + +@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector]) +def test_selector_ipaddress(PSelector): + TestRecord = RecordDescriptor("test/ipaddress", [ + ("string", "description"), + ("net.ipaddress", "ip"), + ]) + + records = [ + TestRecord("Google DNS IPv4", "8.8.8.8"), + TestRecord("Google DNS IPv4", "8.8.4.4"), + TestRecord("Google DNS IPv6", "2001:4860:4860::8888"), + TestRecord("Google DNS IPv6", "2001:4860:4860::4444"), + ] + + recs = [r for r in records if r 
in PSelector("r.ip in net.ipnetwork('8.8.0.0/16')")] + assert len(recs) == 2 + + recs = [r for r in records if r in PSelector("r.ip == '8.8.8.8'")] + assert len(recs) == 1 + + recs = [r for r in records if r in PSelector("r.ip in net.ipnetwork('2001:4860:4860::/48')")] + assert len(recs) == 2 + + record = TestRecord("Optional", None) + assert record not in PSelector("r.ip == '1.1.1.1'") + assert record in PSelector("r.ip == None") + assert record in PSelector("not r.ip") + + +@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector]) +def test_selector_ipnetwork(PSelector): + TestRecord = RecordDescriptor("test/ipnetwork", [ + ("string", "description"), + ("net.ipnetwork", "subnet"), + ]) + + records = [ + # ipv4 + TestRecord("RFC1918", "10.0.0.0/8"), + TestRecord("RFC1918", "172.16.0.0/12"), + TestRecord("RFC1918", "192.168.0.0/16"), + # ipv6 + TestRecord("Private network", "fc00::/7"), + TestRecord("Link local", "fe80::/10"), + TestRecord("Facebook IPv6 range", "2a03:2880::/32"), + ] + recs = [r for r in records if r in PSelector("'fe80::1ff:fe23:4567:890a' in r.subnet")] + assert len(recs) == 1 + + recs = [r for r in records if r in PSelector("'2a03:2880:f003:c07:face:b00c::2' in r.subnet")] + assert len(recs) == 1 + + recs = [r for r in records if r in PSelector("'192.168.1.0/24' in r.subnet")] + assert len(recs) == 1 + assert recs[0].subnet == "192.168.0.0/16" + + recs = [r for r in records if r in PSelector("'192.168.1.141' in r.subnet")] + assert len(recs) == 1 + assert recs[0].subnet == "192.168.0.0/16" + + record = TestRecord("Google", "8.0.0.0/8") + assert record in PSelector("'8.8.4.4' in r.subnet") + assert record in PSelector("'8.8.8.8/32' in r.subnet") + assert record in PSelector("'8.8.0.0/16' in r.subnet") + assert record in PSelector("'8.8.4.0/24' in r.subnet") + assert record in PSelector("'8.8.8.0/24' in r.subnet") + + record = TestRecord("Optional", None) + assert record not in PSelector("r.subnet and '1.1.1.1' in r.subnet") + assert record in PSelector("r.subnet == None") + assert record in PSelector("not r.subnet") + + +@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector]) +def test_selector_ipaddress_in_ipnetwork(PSelector): + TestRecord = RecordDescriptor("test/scandata", [ + ("net.ipaddress", "ip"), + ("uint16", "port"), + ("string", "description"), + ]) + + records = [ + TestRecord("8.8.8.8", 53, "google"), + TestRecord("1.1.1.1", 53, "cloudflare"), + TestRecord("2620:fe::9", 53, "quad9"), + TestRecord(None, None, "empty"), + ] + + for record in records: + if record in PSelector('r.ip in net.ipnetwork("8.8.0.0/16")'): + assert record.ip == "8.8.8.8" + + for record in records: + if record in PSelector('r.ip in net.ipnetwork("1.1.1.1/32")'): + assert record.ip == "1.1.1.1" + + for record in records: + if record in PSelector('r.ip in net.ipnetwork("2620:FE::/48")'): + assert record.description == "quad9" + assert record.ip == "2620:00fe:0:0:0:0:0:0009" + + +def test_pack_ipaddress(): + packer = RecordPacker() + + TestRecord = RecordDescriptor("test/ipaddress", [ + ("net.ipaddress", "ip"), + ]) + + record_in = TestRecord("10.22.99.255") + data = packer.pack(record_in) + record_out = packer.unpack(data) + assert record_in == record_out + + # ip should be encoded as dword/bytes + assert b"\x0a\x16\x63\xff" in data + + +def test_pack_ipnetwork(): + packer = RecordPacker() + + TestRecord = RecordDescriptor("test/ipnetwork", [ + ("net.ipnetwork", "subnet"), + ]) + + record_in = TestRecord("172.16.0.0/16") + data = packer.pack(record_in) + record_out 
= packer.unpack(data) + assert record_in == record_out + + # subnet should be encoded as string + assert b"172.16.0.0/16" in data diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py new file mode 100644 index 0000000..2854b45 --- /dev/null +++ b/tests/test_fieldtypes.py @@ -0,0 +1,458 @@ +# coding: utf-8 + +import pytest +import datetime +import hashlib + +from flow.record import RecordDescriptor +from flow.record.fieldtypes import net +from flow.record.fieldtypes import uri +from flow.record.fieldtypes import fieldtype_for_value +import flow.record.fieldtypes + +INT64_MAX = (1 << 63) - 1 +INT32_MAX = (1 << 31) - 1 +INT16_MAX = (1 << 15) - 1 + +UINT128_MAX = (1 << 128) - 1 +UINT64_MAX = (1 << 64) - 1 +UINT32_MAX = (1 << 32) - 1 +UINT16_MAX = (1 << 16) - 1 + + +def test_uint16(): + desc = RecordDescriptor("test/uint16", [ + ("uint16", "value"), + ]) + + # valid + desc.recordType(0x0) + desc.recordType(0x1) + desc.recordType(UINT16_MAX) + + # invalid + with pytest.raises(ValueError): + desc.recordType(-1) + + with pytest.raises(ValueError): + desc.recordType(UINT16_MAX + 1) + + with pytest.raises((ValueError, OverflowError)): + desc.recordType(UINT128_MAX) + + +def test_uint32(): + TestRecord = RecordDescriptor("test/uint32", [ + ("uint32", "value"), + ]) + + # valid + TestRecord(0x0) + TestRecord(0x1) + TestRecord(UINT16_MAX) + TestRecord(UINT32_MAX) + + # invalid + with pytest.raises(ValueError): + TestRecord(-1) + + with pytest.raises(ValueError): + TestRecord(UINT32_MAX + 1) + + with pytest.raises((ValueError, OverflowError)): + TestRecord(UINT128_MAX) + + +def test_net_ipv4_address(): + TestRecord = RecordDescriptor("test/net/ipv4/address", [ + ("net.ipv4.Address", "ip"), + ]) + + TestRecord("1.1.1.1") + TestRecord("0.0.0.0") + TestRecord("192.168.0.1") + TestRecord("255.255.255.255") + + r = TestRecord(u"127.0.0.1") + + assert isinstance(r.ip, net.ipv4.Address) + + for invalid in ["1.1.1.256", "192.168.0.1/24", "a.b.c.d"]: + with pytest.raises(Exception) as excinfo: + TestRecord(invalid) + excinfo.match(r'.*illegal IP address string.*') + + r = TestRecord() + assert r.ip is None + + +def test_net_ipv4_subnet(): + TestRecord = RecordDescriptor("test/net/ipv4/subnet", [ + ("net.ipv4.Subnet", "subnet"), + ]) + + r = TestRecord("1.1.1.0/24") + assert str(r.subnet) == "1.1.1.0/24" + + assert "1.1.1.1" in r.subnet + assert "1.1.1.2" in r.subnet + + assert "1.1.2.1" not in r.subnet + # assert "1.1.1.1/32" not in r.subnet + + r = TestRecord("0.0.0.0") + r = TestRecord("192.168.0.1") + r = TestRecord("255.255.255.255") + + r = TestRecord(u"127.0.0.1") + + for invalid in ["a.b.c.d", "foo", "bar", ""]: + with pytest.raises(Exception) as excinfo: + TestRecord(invalid) + excinfo.match(r'.*illegal IP address string.*') + + for invalid in [1, 1.0, sum, dict(), list(), True]: + with pytest.raises(TypeError) as excinfo: + TestRecord(invalid) + excinfo.match(r'Subnet\(\) argument 1 must be string, not .*') + + with pytest.raises(ValueError) as excinfo: + TestRecord("192.168.0.106/28") + excinfo.match(r"Not a valid subnet '192\.168\.0\.106/28', did you mean '192\.168\.0\.96/28' ?") + + +def test_bytes(): + TestRecord = RecordDescriptor("test/string", [ + ("string", "url"), + ("bytes", "body"), + ]) + + r = TestRecord("url", b"some bytes") + assert r.body == b"some bytes" + + with pytest.raises(TypeError) as excinfo: + r = TestRecord("url", 1234) + excinfo.match(r"Value not of bytes type") + + with pytest.raises(TypeError) as excinfo: + r = TestRecord("url", u"a string") + 
excinfo.match(r"Value not of bytes type") + + b_array = bytes(bytearray(range(256))) + body = b"HTTP/1.1 200 OK\r\n\r\n" + b_array + r = TestRecord("http://www.fox-it.com", body) + assert r + assert r.url == u"http://www.fox-it.com" + assert r.body == b"HTTP/1.1 200 OK\r\n\r\n" + b_array + + # testcase when input are bytes + r = TestRecord("http://www.fox-it.com", b'HTTP/1.1 500 Error\r\n\r\nError') + assert r.body == b"HTTP/1.1 500 Error\r\n\r\nError" + + +def test_string(): + TestRecord = RecordDescriptor("test/string", [ + ("string", "name"), + ]) + + r = TestRecord("Fox-IT") + assert r.name == u"Fox-IT" + + r = TestRecord(u"Rémy") + assert r.name == u"Rémy" + + # construct from 'bytes' + r = TestRecord(b'R\xc3\xa9my') + assert r.name == u"Rémy" + + # construct from 'bytes' but with invalid unicode bytes + if isinstance(u'', str): + # Python 3 + with pytest.raises(UnicodeDecodeError): + TestRecord(b'R\xc3\xa9\xeamy') + else: + # Python 2 + with pytest.warns(RuntimeWarning): + r = TestRecord(b'R\xc3\xa9\xeamy') + assert r.name + + +def test_wstring(): + # Behaves the same as test/string, only available for backwards compatibility purposes + TestRecord = RecordDescriptor("test/wstring", [ + ("wstring", "name"), + ]) + + r = TestRecord("Fox-IT") + assert r.name == u"Fox-IT" + + +def test_typedlist(): + TestRecord = RecordDescriptor("test/typedlist", [ + ("string[]", "string_value"), + ("uint32[]", "uint32_value"), + ("uri[]", "uri_value"), + ]) + + r = TestRecord(['a', 'b', 'c'], [1, 2, 3], ["/etc/passwd", "/etc/shadow"]) + assert len(r.string_value) == 3 + assert len(r.uint32_value) == 3 + assert len(r.uri_value) == 2 + assert r.string_value[2] == 'c' + assert r.uint32_value[1] == 2 + assert all([isinstance(v, uri) for v in r.uri_value]) + assert r.uri_value[1].filename == 'shadow' + + r = TestRecord() + assert r.string_value == [] + assert r.uint32_value == [] + assert r.uri_value == [] + + with pytest.raises(ValueError): + r = TestRecord(uint32_value=['a', 'b', 'c']) + + +def test_stringlist(): + TestRecord = RecordDescriptor("test/string", [ + ("stringlist", "value"), + ]) + + r = TestRecord(['a', 'b', 'c']) + assert len(r.value) == 3 + assert r.value[2] == 'c' + + r = TestRecord([u"Rémy"]) + assert r.value[0] + + +def test_dictlist(): + TestRecord = RecordDescriptor("test/dictlist", [ + ("dictlist", "hits"), + ]) + + r = TestRecord([{"a": 1, "b": 2}, {"a": 3, "b": 4}]) + assert len(r.hits) == 2 + assert r.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}] + assert r.hits[0]["a"] == 1 + assert r.hits[0]["b"] == 2 + assert r.hits[1]["a"] == 3 + assert r.hits[1]["b"] == 4 + + +def test_boolean(): + TestRecord = RecordDescriptor("test/boolean", [ + ("boolean", "booltrue"), + ("boolean", "boolfalse"), + ]) + + r = TestRecord(True, False) + assert bool(r.booltrue) is True + assert bool(r.boolfalse) is False + + r = TestRecord(1, 0) + assert bool(r.booltrue) is True + assert bool(r.boolfalse) is False + + assert str(r.booltrue) == "True" + assert str(r.boolfalse) == "False" + + assert repr(r.booltrue) == "True" + assert repr(r.boolfalse) == "False" + + with pytest.raises(ValueError): + r = TestRecord(2, -1) + + with pytest.raises(ValueError): + r = TestRecord('True', 'False') + + +def test_float(): + TestRecord = RecordDescriptor("test/float", [ + ("float", "value"), + ]) + + # initialize via float + r = TestRecord(1.3337) + assert r.value == 1.3337 + + # initialize via string + r = TestRecord("1.3337") + assert r.value == 1.3337 + + # initialize via int + r = TestRecord("1337") + assert r.value 
== 1337.0 + + # negative float + r = TestRecord(-12345) + assert r.value == -12345 + + # invalid float + with pytest.raises(ValueError): + r = TestRecord("abc") + + +def test_uri_type(): + TestRecord = RecordDescriptor("test/uri", [ + ("uri", "path"), + ]) + + r = TestRecord("http://www.google.com/a.bin") + assert r.path.filename == "a.bin" + assert r.path.dirname == "/" + assert r.path.hostname == "www.google.com" + assert r.path.protocol == "http" + assert r.path.protocol == r.path.scheme + assert r.path.path == "/a.bin" + + r = TestRecord("http://username:password@example.com/path/file.txt?query=1") + assert r.path.filename == "file.txt" + assert r.path.dirname == "/path" + assert r.path.args == "query=1" + assert r.path.username == "username" + assert r.path.password == "password" + assert r.path.protocol == "http" + assert r.path.hostname == "example.com" + + r = TestRecord(uri.from_windows(r"c:\windows\program files\Fox-IT B.V\flow.exe")) + assert r.path.filename == "flow.exe" + + r = TestRecord() + r.path = uri.normalize(r"c:\Users\Fox-IT\Downloads\autoruns.exe") + assert r.path.filename == "autoruns.exe" + assert r.path.dirname == uri.normalize(r"\Users\Fox-IT\Downloads") + assert r.path.dirname == "/Users/Fox-IT/Downloads" + + r = TestRecord() + r.path = "/usr/local/bin/sshd" + assert r.path.filename == "sshd" + assert r.path.dirname == "/usr/local/bin" + + +def test_datetime(): + TestRecord = RecordDescriptor("test/datetime", [ + ("datetime", "ts"), + ]) + + now = datetime.datetime.utcnow() + r = TestRecord(now) + assert r.ts == now + + r = TestRecord(u"2018-03-22T15:15:23") + assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + + r = TestRecord(u"2018-03-22T15:15:23.000000") + assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + + r = TestRecord(u"2018-03-22T15:15:23.123456") + assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + + dt = datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + dt_str = dt.isoformat() + r = TestRecord(dt_str) + assert r.ts == dt + + r = TestRecord(1521731723) + assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + + +def test_digest(): + TestRecord = RecordDescriptor("test/digest", [ + ("digest", "digest"), + ]) + + md5 = hashlib.md5(b"hello").hexdigest() + sha1 = hashlib.sha1(b"hello").hexdigest() + sha256 = hashlib.sha256(b"hello").hexdigest() + + record = TestRecord() + assert isinstance(record.digest, flow.record.fieldtypes.digest) + + record = TestRecord((md5, sha1, sha256)) + assert record.digest.md5 == "5d41402abc4b2a76b9719d911017c592" + assert record.digest.sha1 == "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d" + assert record.digest.sha256 == "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824" + + record = TestRecord(("5d41402abc4b2a76b9719d911017c592", None, None)) + assert record.digest.md5 == "5d41402abc4b2a76b9719d911017c592" + assert record.digest.sha1 is None + assert record.digest.sha256 is None + + record = TestRecord() + record.digest = (md5, sha1, sha256) + assert record.digest.md5 == md5 + assert record.digest.sha1 == sha1 + assert record.digest.sha256 == sha256 + + with pytest.raises(TypeError) as excinfo: + record = TestRecord(("a", sha1, sha256)) + excinfo.match(r'.*Invalid MD5.*Odd-length string') + + with pytest.raises(TypeError) as excinfo: + record = TestRecord(("aa", sha1, sha256)) + excinfo.match(r'.*Invalid MD5.*Incorrect hash length') + + with pytest.raises(TypeError) as excinfo: + record = TestRecord((md5, "aa", sha256)) + excinfo.match(r'.*Invalid SHA1.*') + + with 
pytest.raises(TypeError) as excinfo: + record = TestRecord((md5, sha1, "aa")) + excinfo.match(r'.*Invalid SHA256.*') + + record = TestRecord() + assert record.digest is not None + assert record.digest.md5 is None + assert record.digest.sha1 is None + assert record.digest.sha256 is None + with pytest.raises(TypeError) as excinfo: + record.digest.md5 = "INVALID MD5" + excinfo.match(r'.*Invalid MD5.*') + + +def test_dynamic(): + TestRecord = RecordDescriptor("test/dynamic", [ + ("dynamic", "value"), + ]) + + r = TestRecord(b"bytes") + assert r.value == b"bytes" + assert isinstance(r.value, flow.record.fieldtypes.bytes) + + r = TestRecord(u"string") + assert r.value == u"string" + assert isinstance(r.value, flow.record.fieldtypes.string) + + r = TestRecord(123) + assert r.value == 123 + assert isinstance(r.value, flow.record.fieldtypes.varint) + + r = TestRecord(True) + assert r.value + assert isinstance(r.value, flow.record.fieldtypes.boolean) + + r = TestRecord([1, 2, 3]) + assert r.value == [1, 2, 3] + assert isinstance(r.value, flow.record.fieldtypes.stringlist) + + now = datetime.datetime.utcnow() + r = TestRecord(now) + assert r.value == now + assert isinstance(r.value, flow.record.fieldtypes.datetime) + + +def test_fieldtype_for_value(): + assert fieldtype_for_value(True) == "boolean" + assert fieldtype_for_value(False) == "boolean" + assert fieldtype_for_value(1337) == "varint" + assert fieldtype_for_value(1.337) == "float" + assert fieldtype_for_value(b"\r\n") == "bytes" + assert fieldtype_for_value("hello world") == "string" + assert fieldtype_for_value(datetime.datetime.now()) == "datetime" + assert fieldtype_for_value([1, 2, 3, 4, 5]) == "string" + assert fieldtype_for_value([1, 2, 3, 4, 5], None) is None + assert fieldtype_for_value(object(), None) is None + + +if __name__ == "__main__": + __import__("standalone_test").main(globals()) diff --git a/tests/test_json_packer.py b/tests/test_json_packer.py new file mode 100644 index 0000000..cfce228 --- /dev/null +++ b/tests/test_json_packer.py @@ -0,0 +1,25 @@ +from __future__ import print_function +from datetime import datetime +from flow.record import JsonRecordPacker, RecordDescriptor + + +def test_record_in_record(): + packer = JsonRecordPacker() + dt = datetime.utcnow() + + RecordA = RecordDescriptor("test/record_a", [ + ("datetime", "some_dt"), + ]) + RecordB = RecordDescriptor("test/record_b", [ + ("record", "record"), + ("datetime", "some_dt"), + ]) + + record_a = RecordA(dt) + record_b = RecordB(record_a, dt) + + data_record_b = packer.pack(record_b) + record_b_unpacked = packer.unpack(data_record_b) + + assert record_b == record_b_unpacked + assert record_a == record_b_unpacked.record diff --git a/tests/test_json_record_adapter.py b/tests/test_json_record_adapter.py new file mode 100644 index 0000000..2b6b11a --- /dev/null +++ b/tests/test_json_record_adapter.py @@ -0,0 +1,71 @@ +import json +import datetime +from flow.record import RecordDescriptor, RecordWriter, RecordReader + + +def generate_records(count=100): + TestRecordEmbedded = RecordDescriptor("test/embedded_record", [ + ("datetime", "dt"), + ]) + TestRecord = RecordDescriptor("test/adapter", [ + ("uint32", "number"), + ("record", "record"), + ]) + + for i in range(count): + embedded = TestRecordEmbedded(datetime.datetime.utcnow()) + yield TestRecord(number=i, record=embedded) + + +def test_json_adapter(tmpdir): + json_file = tmpdir.join("records.json") + record_adapter_path = "jsonfile://{}".format(json_file) + writer = RecordWriter(record_adapter_path) + 
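# Illustrative sketch (assumptions: flow.record is installed and writable temp
# space exists; the "example/simple" descriptor below is made up for this
# sketch and is not part of the patch). It shows the same jsonfile:// adapter
# round trip that this test exercises, in isolation:
import os
import tempfile

from flow.record import RecordDescriptor, RecordReader, RecordWriter

Simple = RecordDescriptor("example/simple", [("uint32", "number")])
out_path = os.path.join(tempfile.mkdtemp(), "example.jsonl")

with RecordWriter("jsonfile://{}".format(out_path)) as sketch_writer:
    for n in range(3):
        sketch_writer.write(Simple(number=n))  # records serialized as JSON lines

with RecordReader("jsonfile://{}".format(out_path)) as sketch_reader:
    assert [rec.number for rec in sketch_reader] == [0, 1, 2]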
nr_records = 1337 + + for record in generate_records(nr_records): + writer.write(record) + writer.flush() + + nr_received_records = 0 + reader = RecordReader(record_adapter_path) + for record in reader: + nr_received_records += 1 + + assert nr_records == nr_received_records + + +def test_json_adapter_contextmanager(tmpdir): + json_file = tmpdir.join("records.json") + record_adapter_path = "jsonfile://{}".format(json_file) + with RecordWriter(record_adapter_path) as writer: + nr_records = 1337 + for record in generate_records(nr_records): + writer.write(record) + + nr_received_records = 0 + with RecordReader(record_adapter_path) as reader: + for record in reader: + nr_received_records += 1 + + assert nr_records == nr_received_records + + +def test_json_adapter_jsonlines(tmpdir): + json_file = tmpdir.join("data.jsonl") + + items = [ + {'some_float': 1.5, 'some_string': 'hello world', 'some_int': 1337, 'some_bool': True}, + {'some_float': 2.7, 'some_string': 'goodbye world', 'some_int': 12345, 'some_bool': False}, + ] + with open(json_file, "w") as fout: + for row in items: + fout.write(json.dumps(row) + "\n") + + record_adapter_path = "jsonfile://{}".format(json_file) + reader = RecordReader(record_adapter_path) + for index, record in enumerate(reader): + assert record.some_float == items[index]["some_float"] + assert record.some_string == items[index]["some_string"] + assert record.some_int == items[index]["some_int"] + assert record.some_bool == items[index]["some_bool"] diff --git a/tests/test_packer.py b/tests/test_packer.py new file mode 100644 index 0000000..4c5ffb2 --- /dev/null +++ b/tests/test_packer.py @@ -0,0 +1,216 @@ +import datetime + +from flow.record import fieldtypes +from flow.record import RecordDescriptor +from flow.record import RecordPacker +from flow.record.packer import RECORD_PACK_EXT_TYPE +from flow.record.fieldtypes import uri + + +def test_uri_packing(): + packer = RecordPacker() + + TestRecord = RecordDescriptor("test/uri", [ + ("uri", "path"), + ]) + + # construct with an url + record = TestRecord("http://www.google.com/evil.bin") + data = packer.pack(record) + record = packer.unpack(data) + assert record.path == "http://www.google.com/evil.bin" + assert record.path.filename == "evil.bin" + assert record.path.dirname == "/" + + # construct from uri() -> for windows=True + path = uri.from_windows(r"c:\Program Files\Fox-IT\flow is awesome.exe") + record = TestRecord(path) + data = packer.pack(record) + record = packer.unpack(data) + assert record.path == "c:/Program Files/Fox-IT/flow is awesome.exe" + assert record.path.filename == "flow is awesome.exe" + assert record.path.dirname == "/Program Files/Fox-IT" + + # construct using uri.from_windows() + path = uri.from_windows(r"c:\Users\Hello World\foo.bar.exe") + record = TestRecord(path) + data = packer.pack(record) + record = packer.unpack(data) + assert record.path == "c:/Users/Hello World/foo.bar.exe" + assert record.path.filename == "foo.bar.exe" + assert record.path.dirname == "/Users/Hello World" + + +def test_typedlist_packer(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/typedlist", [ + ("string[]", "string_value"), + ("uint32[]", "uint32_value"), + ("uri[]", "uri_value"), + ]) + + r1 = TestRecord(['a', 'b', 'c'], [1, 2, 3], ["/etc/passwd", "/etc/shadow"]) + data = packer.pack(r1) + r2 = packer.unpack(data) + + assert len(r1.string_value) == 3 + assert len(r1.uint32_value) == 3 + assert len(r1.uri_value) == 2 + assert r1.string_value[2] == 'c' + assert r1.uint32_value[1] == 2 + assert 
all([isinstance(v, uri) for v in r1.uri_value]) + assert r1.uri_value[1].filename == 'shadow' + + assert len(r2.string_value) == 3 + assert len(r2.uint32_value) == 3 + assert len(r2.uri_value) == 2 + assert r2.string_value[2] == 'c' + assert r2.uint32_value[1] == 2 + assert all([isinstance(v, uri) for v in r2.uri_value]) + assert r2.uri_value[1].filename == 'shadow' + + +def test_dictlist_packer(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/dictlist", [ + ("dictlist", "hits"), + ]) + + r1 = TestRecord([{"a": 1, "b": 2}, {"a": 3, "b": 4}]) + data = packer.pack(r1) + r2 = packer.unpack(data) + + assert len(r1.hits) == 2 + assert r1.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}] + assert r1.hits[0]["a"] == 1 + assert r1.hits[0]["b"] == 2 + assert r1.hits[1]["a"] == 3 + assert r1.hits[1]["b"] == 4 + + assert len(r2.hits) == 2 + assert r2.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}] + assert r2.hits[0]["a"] == 1 + assert r2.hits[0]["b"] == 2 + assert r2.hits[1]["a"] == 3 + assert r2.hits[1]["b"] == 4 + + +def test_dynamic_packer(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/dynamic", [ + ("dynamic", "value"), + ]) + + t = TestRecord(123) + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value == 123 + assert isinstance(r.value, fieldtypes.varint) + + t = TestRecord(b"bytes") + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value == b"bytes" + assert isinstance(r.value, fieldtypes.bytes) + + t = TestRecord(u"string") + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value == u"string" + assert isinstance(r.value, fieldtypes.string) + + t = TestRecord(True) + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value + assert isinstance(r.value, fieldtypes.boolean) + + t = TestRecord([1, True, b"b", u"u"]) + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value == [1, True, b"b", u"u"] + assert isinstance(r.value, fieldtypes.stringlist) + + now = datetime.datetime.utcnow() + t = TestRecord(now) + data = packer.pack(t) + r = packer.unpack(data) + + assert r.value == now + assert isinstance(r.value, fieldtypes.datetime) + + +def test_pack_record_desc(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/pack", [ + ("string", "a"), + ]) + ext_type = packer.pack_obj(TestRecord) + assert ext_type.code == RECORD_PACK_EXT_TYPE + assert ext_type.data == b"\x92\x02\x92\xa9test/pack\x91\x92\xa6string\xa1a" + desc = packer.unpack_obj(ext_type.code, ext_type.data) + assert desc.name == TestRecord.name + assert desc.fields.keys() == TestRecord.fields.keys() + assert desc._pack() == TestRecord._pack() + + +def test_pack_digest(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/digest", [ + ("digest", "digest"), + ]) + record = TestRecord(("d41d8cd98f00b204e9800998ecf8427e", None, None)) + data = packer.pack(record) + record = packer.unpack(data) + assert record.digest.md5 == "d41d8cd98f00b204e9800998ecf8427e" + assert record.digest.sha1 is None + assert record.digest.sha256 is None + + +def test_record_in_record(): + packer = RecordPacker() + dt = datetime.datetime.utcnow() + + RecordA = RecordDescriptor("test/record_a", [ + ("datetime", "some_dt"), + ]) + RecordB = RecordDescriptor("test/record_b", [ + ("record", "record"), + ("datetime", "some_dt"), + ]) + + record_a = RecordA(dt) + record_b = RecordB(record_a, dt) + + data_record_b = packer.pack(record_b) + record_b_unpacked = packer.unpack(data_record_b) + + assert record_b == record_b_unpacked + assert record_a == 
record_b_unpacked.record + + +def test_record_array(): + packer = RecordPacker() + + EmbeddedRecord = RecordDescriptor("test/record_a", [ + ("string", "some_field"), + ]) + ParentRecord = RecordDescriptor("test/record_b", [ + ("record[]", "subrecords"), + ]) + + parent = ParentRecord() + for i in range(3): + emb_record = EmbeddedRecord( + some_field="embedded record {}".format(i)) + parent.subrecords.append(emb_record) + + data_record_parent = packer.pack(parent) + parent_unpacked = packer.unpack(data_record_parent) + + assert parent == parent_unpacked diff --git a/tests/test_rdump.py b/tests/test_rdump.py new file mode 100644 index 0000000..b941b18 --- /dev/null +++ b/tests/test_rdump.py @@ -0,0 +1,178 @@ +import json +import base64 +import hashlib +import subprocess + +from flow.record import RecordDescriptor +from flow.record import RecordWriter, RecordReader + + +def test_rdump_pipe(tmp_path): + TestRecord = RecordDescriptor("test/record", [ + ("varint", "count"), + ("string", "foo"), + ]) + + path = tmp_path / "test.records" + writer = RecordWriter(path) + + for i in range(10): + writer.write(TestRecord(count=i, foo="bar")) + writer.close() + + # validate input + args = ["rdump", str(path)] + res = subprocess.Popen(args, stdout=subprocess.PIPE) + stdout, stderr = res.communicate() + assert len(stdout.splitlines()) == 10 + + # rdump test.records | wc -l + p1 = subprocess.Popen(["rdump", str(path)], stdout=subprocess.PIPE) + p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE) + stdout, stderr = p2.communicate() + assert stdout.strip() == b"10" + + # (binary) rdump test.records -w - | rdump -s 'r.count == 5' + p1 = subprocess.Popen(["rdump", str(path), "-w", "-"], stdout=subprocess.PIPE) + p2 = subprocess.Popen( + ["rdump", "-s", "r.count == 5"], stdin=p1.stdout, stdout=subprocess.PIPE, + ) + stdout, stderr = p2.communicate() + assert stdout.strip() in (b"", b"") + + # (printer) rdump test.records | rdump -s 'r.count == 5' + p1 = subprocess.Popen(["rdump", str(path)], stdout=subprocess.PIPE) + p2 = subprocess.Popen( + ["rdump", "-s", "r.count == 5"], + stdin=p1.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = p2.communicate() + assert stdout.strip() == b"" + assert b"Unknown file format, not a RecordStream" in stderr.strip() + + # rdump test.records -w - | rdump -s 'r.count in (1, 3, 9)' -w filtered.records + path2 = tmp_path / "filtered.records" + p1 = subprocess.Popen(["rdump", str(path), "-w", "-"], stdout=subprocess.PIPE) + p2 = subprocess.Popen( + ["rdump", "-s", "r.count in (1, 3, 9)", "-w", str(path2)], stdin=p1.stdout, + ) + stdout, stderr = p2.communicate() + + reader = RecordReader(path2) + records = list(reader) + assert len(records) == 3 + assert {r.count for r in records} == {1, 3, 9} + + +def test_rdump_format_template(tmp_path): + TestRecord = RecordDescriptor("test/record", [ + ("varint", "count"), + ("string", "foo"), + ]) + + path = tmp_path / "test.records" + writer = RecordWriter(path) + + # generate some test records + for i in range(10): + writer.write(TestRecord(count=i, foo="bar")) + writer.close() + + # validate output with -f + args = ["rdump", str(path), "-f", "TEST: {count},{foo}"] + print(args) + res = subprocess.Popen(args, stdout=subprocess.PIPE) + stdout, stderr = res.communicate() + for i, line in enumerate(stdout.decode().splitlines()): + assert line == "TEST: {i},bar".format(i=i) + + +def test_rdump_json(tmp_path): + TestRecord = RecordDescriptor("test/record", [ + ("varint", "count"), + 
("string", "foo"), + ("bytes", "data"), + ("net.ipaddress", "ip"), + ("net.ipnetwork", "subnet"), + ("digest", "digest"), + ]) + + record_path = tmp_path / "test.records" + writer = RecordWriter(record_path) + + # generate some test records + for i in range(10): + data = str(i).encode() + md5 = hashlib.md5(data).hexdigest() + sha1 = hashlib.sha1(data).hexdigest() + sha256 = hashlib.sha256(data).hexdigest() + writer.write( + TestRecord( + count=i, + foo="bar" * i, + data=b"\x00\x01\x02\x03--" + data, + ip=u"172.16.0.{}".format(i), + subnet=u"192.168.{}.0/24".format(i), + digest=(md5, sha1, sha256), + )) + writer.close() + + # dump records as JSON lines + args = ["rdump", str(record_path), "--jsonlines"] + process = subprocess.Popen(args, stdout=subprocess.PIPE) + stdout, stderr = process.communicate() + + assert process.returncode == 0 + + # Basic validations in stdout + for i in range(10): + assert base64.b64encode("\x00\x01\x02\x03--{}".format(i).encode()) in stdout + assert u"192.168.{}.0/24".format(i).encode() in stdout + assert u"172.16.0.{}".format(i).encode() in stdout + assert ("bar" * i).encode() in stdout + + # Load json using json.loads() and validate key values + for i, line in enumerate(stdout.splitlines()): + json_dict = json.loads(line) + assert json_dict + if i == 0: + assert "_type" in json_dict + assert json_dict["_type"] == "recorddescriptor" + else: + count = i - 1 # fix offset as first line is the recorddescriptor information + data = str(count).encode() + md5 = hashlib.md5(data).hexdigest() + sha1 = hashlib.sha1(data).hexdigest() + sha256 = hashlib.sha256(data).hexdigest() + assert json_dict["count"] == count + assert json_dict["foo"] == "bar" * count + assert json_dict["data"] == base64.b64encode("\x00\x01\x02\x03--{}".format(count).encode()).decode() + assert json_dict["ip"] == u"172.16.0.{}".format(count) + assert json_dict["subnet"] == u"192.168.{}.0/24".format(count) + assert json_dict["digest"]["md5"] == md5 + assert json_dict["digest"]["sha1"] == sha1 + assert json_dict["digest"]["sha256"] == sha256 + + # Write jsonlines to file + path = tmp_path / "records.jsonl" + path.write_bytes(stdout) + json_path = "jsonfile://{}".format(path) + + # Read records from json and original records file and validate + for path in (json_path, record_path): + with RecordReader(path) as reader: + for i, record in enumerate(reader): + data = str(i).encode() + md5 = hashlib.md5(data).hexdigest() + sha1 = hashlib.sha1(data).hexdigest() + sha256 = hashlib.sha256(data).hexdigest() + assert record.count == i + assert record.ip == u"172.16.0.{}".format(i) + assert record.subnet == u"192.168.{}.0/24".format(i) + assert record.data == b"\x00\x01\x02\x03--" + data + assert record.digest.md5 == md5 + assert record.digest.sha1 == sha1 + assert record.digest.sha256 == sha256 + assert record.foo == "bar" * i diff --git a/tests/test_record.py b/tests/test_record.py new file mode 100644 index 0000000..d22a100 --- /dev/null +++ b/tests/test_record.py @@ -0,0 +1,613 @@ +import sys +import pytest +from flow.record import RECORD_VERSION +from flow.record import RecordDescriptor, RecordDescriptorError +from flow.record import RecordPacker +from flow.record import RecordWriter, RecordReader, RecordPrinter +from flow.record import Record, GroupedRecord +from flow.record import record_stream, extend_record +from flow.record import fieldtypes +from flow.record.stream import RecordFieldRewriter + +from . 
import utils_inspect as inspect + + +def test_record_creation(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ]) + + # No arguments defaults to None + r = TestRecord() + assert r.url is None + assert r.query is None + + # Keyword arguments + r = TestRecord(url="foo", query="bar") + assert r.url == "foo" + assert r.query == "bar" + + # Positional arguments + r = TestRecord("foo", "bar") + assert r.url == "foo" + assert r.query == "bar" + + # Single keyword argument + r = TestRecord(query="foo") + assert r.query == "foo" + assert r.url is None + + +def test_record_version(tmpdir): + path = "jsonfile://{}".format(tmpdir.join("test.jsonl").strpath) + writer = RecordWriter(path) + packer = RecordPacker() + TestRecord = RecordDescriptor("test/record", [ + ("string", "hello"), + ("string", "world"), + ]) + + r1 = TestRecord(hello="hello", world="world") + writer.write(r1) + data = packer.pack(r1) + u1 = packer.unpack(data) + print(repr(u1._desc)) + + assert u1.hello == r1.hello + assert u1.world == r1.world + + # change the order + TestRecord = RecordDescriptor("test/record", [ + ("string", "world"), + ("string", "hello"), + ]) + r2 = TestRecord(hello="hello", world="world") + writer.write(r2) + data = packer.pack(r2) + u2 = packer.unpack(data) + + assert u2.hello == r2.hello + assert u2.world == r2.world + print(repr(u2._desc)) + + # change fieldtypes + TestRecord = RecordDescriptor("test/record", [ + ("varint", "world"), + ("string", "hello"), + ]) + r3 = TestRecord(hello="hello", world=42) + writer.write(r3) + data = packer.pack(r3) + u3 = packer.unpack(data) + + writer.flush() + + assert u3._desc.identifier == r3._desc.identifier + assert u1._desc.identifier != u3._desc.identifier + assert u2._desc.identifier != u3._desc.identifier + assert u3.hello == r3.hello + assert u3.world == r3.world + + reader = RecordReader(path) + rec = [r for r in reader] + assert len(rec) == 3 + assert u3._desc.identifier == rec[2]._desc.identifier + assert u1._desc.identifier != rec[2]._desc.identifier + assert u2._desc.identifier != rec[2]._desc.identifier + assert u3.hello == rec[2].hello + assert u3.world == rec[2].world + + +def test_grouped_record(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "hello"), + ("string", "world"), + ("uint32", "count"), + ]) + WQMetaRecord = RecordDescriptor("wq/meta", [ + ("string", "assignee"), + ("string", "profile"), + ("string", "hello"), + ]) + + test_record = TestRecord("a", "b", 12345) + meta_record = WQMetaRecord("me", "this is a test", "other hello") + + grouped = GroupedRecord("grouped/wq", [test_record, meta_record]) + assert grouped.hello == "a" + assert grouped.world == "b" + assert grouped.count == 12345 + assert grouped.assignee == "me" + assert grouped.profile == "this is a test" + + grouped.profile = "omg" + grouped.hello = "new value" + assert grouped.hello == "new value" + assert grouped.profile == "omg" + assert grouped.records[0].hello == "new value" + assert grouped.records[1].hello == "other hello" + + grouped.records[1].hello = "testing" + assert grouped.hello != "testing" + assert grouped.hello == "new value" + assert grouped.records[1].hello == "testing" + + assert len(grouped.records) == 2 + + # test grouped._asdict + rdict = grouped._asdict() + assert set(["hello", "world", "count", "assignee", "profile", "hello"]) <= set(rdict) + + rdict = grouped._asdict(fields=["profile", "count", "_generated"]) + assert set(["profile", "count", "_generated"]) == set(rdict) + assert 
rdict["profile"] == "omg" + assert rdict["count"] == 12345 + + +def test_grouped_records_packing(tmpdir): + RecordA = RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + RecordB = RecordDescriptor("test/b", [ + ("string", "b_string"), + ("string", "common"), + ("uint32", "b_count"), + ]) + a = RecordA("hello", "world", 12345, _source="TheBadInternet", _classification="CLASSIFIED") + b = RecordB("good", "bye", 54321, _source="TheGoodInternet", _classification="TLP.WHITE") + assert isinstance(a, Record) + assert not isinstance(a, GroupedRecord) + + grouped = GroupedRecord("grouped/ab", [a, b]) + assert isinstance(grouped, (Record, GroupedRecord)) + assert [(f.typename, f.name) for f in grouped._desc.fields.values()] == [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ("string", "b_string"), + ("uint32", "b_count"), + ] + + path = tmpdir.join("grouped.records").strpath + writer = RecordWriter(path) + writer.write(grouped) + writer.write(grouped) + writer.write(grouped) + writer.write(grouped) + writer.write(grouped) + writer.flush() + + reader = RecordReader(path) + record = next(iter(reader)) + + # grouped record tests + assert isinstance(record, Record) + assert isinstance(record, GroupedRecord) + assert record.common == "world" # first 'key' has precendence + assert record.name == "grouped/ab" + assert record.a_string == "hello" + assert record.a_count == 12345 + assert record.b_count == 54321 + assert record.b_string == "good" + assert record._source == "TheBadInternet" + assert record._classification == "CLASSIFIED" + + # access 'common' on second record directly + assert record.records[1].common == "bye" + + # access raw records directly + assert len(record.records) == 2 + assert record.records[0]._desc.name == "test/a" + assert record.records[1]._desc.name == "test/b" + + # test using selectors + reader = RecordReader(path, selector="r.a_count == 12345") + assert len(list(iter(reader))) == 5 + + reader = RecordReader(path, selector="r.common == 'bye'") + assert len(list(iter(reader))) == 0 + reader = RecordReader(path, selector="r.common == 'world'") + assert len(list(iter(reader))) == 5 + + +def test_record_reserved_fieldname(): + with pytest.raises(RecordDescriptorError): + RecordDescriptor("test/a", [ + ("string", "_classification"), + ("string", "_source"), + ("uint32", "_generated"), + ]) + + +def test_record_printer_stdout(capsys): + Record = RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + record = Record("hello", "world", 10) + + # fake capsys to be a tty. 
+ def isatty(): + return True + capsys._capture.out.tmpfile.isatty = isatty + + writer = RecordPrinter(getattr(sys.stdout, "buffer", sys.stdout)) + writer.write(record) + + out, err = capsys.readouterr() + modifier = '' if isinstance(u'', str) else 'u' + expected = "\n".format(u=modifier) + assert out == expected + + +def test_record_field_limit(): + count = 1337 + fields = [('uint32', 'field_{}'.format(i)) for i in range(count)] + values = dict([('field_{}'.format(i), i) for i in range(count)]) + + Record = RecordDescriptor("test/limit", fields) + record = Record(**values) + + for i in range(count): + assert getattr(record, 'field_{}'.format(i)) == i + + # test kwarg init + record = Record(field_404=12345) + assert record.field_404 == 12345 + assert record.field_0 is None + + # test arg init + record = Record(200, 302, 404) + assert record.field_0 == 200 + assert record.field_1 == 302 + assert record.field_2 == 404 + assert record.field_404 is None + + # test arg + kwarg init + record = Record(200, 302, 404, field_502=502) + assert record.field_0 == 200 + assert record.field_1 == 302 + assert record.field_2 == 404 + assert record.field_3 is None + assert record.field_502 == 502 + + +def test_record_internal_version(): + Record = RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + + record = Record("hello", "world", 10) + assert record._version == RECORD_VERSION + + record = Record("hello", "world", 10, _version=1337) + assert record._version == RECORD_VERSION + + +def test_record_reserved_keyword(): + Record = RecordDescriptor("test/a", [ + ("string", "from"), + ("string", "and"), + ("uint32", "or"), + ("uint32", "normal"), + ]) + + init = Record.recordType.__init__ + sig = inspect.signature(init) + params = list(sig.parameters.values()) + assert init.__code__.co_argcount == 1 + assert len(params) == 3 + assert params[1].name == 'args' + assert params[1].kind == params[1].VAR_POSITIONAL + assert params[2].name == 'kwargs' + assert params[2].kind == params[2].VAR_KEYWORD + + r = Record('hello', 'world', 1337, 10) + assert getattr(r, 'from') == 'hello' + assert getattr(r, 'and') == 'world' + assert getattr(r, 'or') == 1337 + assert r.normal == 10 + + r = Record('some', 'missing', normal=5) + assert getattr(r, 'from') == 'some' + assert getattr(r, 'and') == 'missing' + assert getattr(r, 'or') is None + assert r.normal == 5 + + r = Record('from_value', **{'and': 'dict', 'or': 7331, 'normal': 3}) + assert getattr(r, 'from') == 'from_value' + assert getattr(r, 'and') == 'dict' + assert getattr(r, 'or') == 7331 + assert r.normal == 3 + + Record = RecordDescriptor("test/a", [ + ("uint32", "normal"), + ]) + + init = Record.recordType.__init__ + sig = inspect.signature(init) + params = list(sig.parameters.values()) + assert init.__code__.co_argcount == 6 + assert len(params) == 6 + assert params[1].name == 'normal' + assert params[1].kind == params[1].POSITIONAL_OR_KEYWORD + assert params[1].default is None + assert params[2].name == '_source' + assert params[2].kind == params[2].POSITIONAL_OR_KEYWORD + assert params[2].default is None + assert params[3].name == '_classification' + assert params[3].kind == params[3].POSITIONAL_OR_KEYWORD + assert params[3].default is None + assert params[4].name == '_generated' + assert params[4].kind == params[4].POSITIONAL_OR_KEYWORD + assert params[4].default is None + assert params[5].name == '_version' + assert params[5].kind == params[5].POSITIONAL_OR_KEYWORD + assert params[5].default is None + + 
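# Note on the two signature checks above: "from", "and" and "or" are Python
# keywords, so the generated __init__ cannot declare them as named parameters
# and appears to fall back to a generic (*args, **kwargs) signature, which is
# what the co_argcount == 1 check verifies; values for such fields are still
# reachable via getattr(), as the assertions show. With only valid identifiers
# the descriptor exposes real parameters again: co_argcount == 6 covers self,
# the "normal" field and the reserved _source/_classification/_generated/
# _version slots. Fields that merely shadow "self"/"cls" are exercised next.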
Record = RecordDescriptor("test/a", [ + ("uint32", "self"), + ("uint32", "cls"), + ]) + r = Record(1, 2) + assert r.self == 1 + assert r.cls == 2 + + +def test_record_stream(tmp_path): + Record = RecordDescriptor("test/counter", [ + ("uint32", "counter"), + ("string", "tag"), + ]) + + datasets = [ + tmp_path / "dataset1.records", + tmp_path / "dataset2.records.gz", + ] + + for ds in datasets: + writer = RecordWriter(str(ds)) + for i in range(100): + writer.write(Record(i, tag=ds.name)) + writer.close() + + datasets = [str(ds) for ds in datasets] + assert len(list(record_stream(datasets))) == len(datasets) * 100 + assert len(list(record_stream(datasets, "r.counter == 42"))) == len(datasets) + + +def test_record_replace(): + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "index"), + ("string", "foo"), + ]) + + t = TestRecord(1, "hello") + assert t.index == 1 + assert t.foo == "hello" + + t2 = t._replace(foo="bar", index=1337) + assert t2.foo == "bar" + assert t2.index == 1337 + + t3 = t._replace() + assert t3.index == 1 + assert t3.foo == "hello" + assert t3._source == t._source + assert t3._generated == t._generated + assert t3._version == t._version + + t4 = t2._replace(foo="test", _source="pytest") + assert t4.index == 1337 + assert t4.foo == "test" + assert t4._source == "pytest" + assert t4._generated == t2._generated + + with pytest.raises(ValueError) as excinfo: + t._replace(foobar="keyword does not exist") + excinfo.match(".*Got unexpected field names:.*foobar.*") + + +def test_record_init_from_record(): + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "index"), + ("string", "foo"), + ]) + + t = TestRecord(1, "hello") + assert t.index == 1 + assert t.foo == "hello" + + TestRecord2 = TestRecord.extend([ + ("string", "bar"), + ("uint32", "test"), + ]) + t2 = TestRecord2.init_from_record(t) + assert t2.index == 1 + assert t2.foo == "hello" + assert t2.bar is None + assert t2.test is None + + t2.bar = "bar" + t2.test = 3 + assert t2.bar == "bar" + assert t2.test == 3 + + TestRecord3 = RecordDescriptor("test/record3", [ + ("string", "test"), + ("uint32", "count"), + ]) + with pytest.raises(TypeError): + t3 = TestRecord3.init_from_record(t2, raise_unknown=True) + + # explicit raise_unknown=False + t3 = TestRecord3.init_from_record(t2, raise_unknown=False) + assert t3.test == "3" + assert t3.count is None + + # default should not raise either + t3 = TestRecord3.init_from_record(t2) + assert t3.test == "3" + assert t3.count is None + + +def test_record_asdict(): + Record = RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + record = Record("hello", "world", 1337) + rdict = record._asdict() + assert rdict.get("a_string") == "hello" + assert rdict.get("common") == "world" + assert rdict.get("a_count") == 1337 + assert set(rdict) == set(["a_string", "common", "a_count", "_source", "_generated", "_version", "_classification"]) + + rdict = record._asdict(fields=["common", "_source", "a_string"]) + assert set(rdict) == set(["a_string", "common", "_source"]) + + rdict = record._asdict(exclude=["a_count", "_source", "_generated", "_version"]) + assert set(rdict) == set(["a_string", "common", "_classification"]) + + rdict = record._asdict(fields=["common", "_source", "a_string"], exclude=["common"]) + assert set(rdict) == set(["a_string", "_source"]) + + +def test_recordfield_rewriter_expression(): + rewriter = RecordFieldRewriter(expression="upper_a = a_string.upper(); count_times_10 = a_count * 10") + Record = 
RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + record = Record("hello", "world", 1337) + new_record = rewriter.rewrite(record) + assert new_record.a_string == "hello" + assert new_record.common == "world" + assert new_record.a_count == 1337 + assert new_record.upper_a == "HELLO" + assert new_record.count_times_10 == 1337 * 10 + + +def test_recordfield_rewriter_fields(): + rewriter = RecordFieldRewriter(fields=["a_count"]) + Record = RecordDescriptor("test/a", [ + ("string", "a_string"), + ("string", "common"), + ("uint32", "a_count"), + ]) + record = Record("hello", "world", 1337) + new_record = rewriter.rewrite(record) + assert hasattr(new_record, "a_count") + assert not hasattr(new_record, "a_string") + assert not hasattr(new_record, "common") + + +def test_extend_record(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ]) + FooRecord = RecordDescriptor("test/foo", [ + ("varint", "foo"), + ("bytes", "query"), + ("bytes", "bar"), + ]) + HelloRecord = RecordDescriptor("test/hello", [ + ("string", "hello"), + ("string", "world"), + ("string", "url"), + ]) + + a = TestRecord("http://flow.record", "myquery") + b = FooRecord(12345, b"FOO", b"BAR") + c = HelloRecord("hello", "world", "http://hello.world") + + new = extend_record(a, [b, c]) + assert new._desc == RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ("varint", "foo"), + ("bytes", "bar"), + ("string", "hello"), + ("string", "world"), + ]) + assert new.url == "http://flow.record" + assert new.query == "myquery" + assert new.foo == 12345 + assert new.bar == b"BAR" + assert new.hello == "hello" + assert new.world == "world" + + new = extend_record(a, [b, c], replace=True) + assert new._desc == RecordDescriptor("test/record", [ + ("string", "url"), + ("bytes", "query"), + ("varint", "foo"), + ("bytes", "bar"), + ("string", "hello"), + ("string", "world"), + ]) + assert new.url == "http://hello.world" + assert new.query == b"FOO" + assert new.foo == 12345 + assert new.bar == b"BAR" + assert new.hello == "hello" + assert new.world == "world" + + +def test_extend_record_with_replace(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "ip"), + ("uint16", "port"), + ("string", "data"), + ("string", "note"), + ]) + ReplaceRecord = RecordDescriptor("test/foo", [ + ("net.ipaddress", "ip"), + ("net.tcp.Port", "port"), + ("bytes", "data"), + ("string", "location"), + ]) + + a = TestRecord("10.13.13.17", 80, "HTTP/1.1 200 OK\r\n", "webserver") + b = ReplaceRecord( + ip=a.ip, + port=a.port, + data=a.data.encode(), + location="DMZ", + ) + new = extend_record(a, [b], replace=False) + assert new.ip == "10.13.13.17" + assert new.port == 80 + assert new.data == "HTTP/1.1 200 OK\r\n" + assert new.note == "webserver" + assert new.location == "DMZ" + assert isinstance(new.ip, str) + assert isinstance(new.port, int) + assert isinstance(new.data, str) + assert isinstance(new.note, str) + assert isinstance(new.location, str) + assert new._desc.name == "test/record" + assert " len(before) + assert len(before) == 3 + assert len(after) == 6 + + +def test_record_archiver(tmpdir): + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "id"), + ]) + + records = [ + TestRecord(id=1, _generated=datetime.datetime(2017, 12, 6, 22, 10)), + TestRecord(id=2, _generated=datetime.datetime(2017, 12, 6, 23, 59)), + TestRecord(id=3, _generated=datetime.datetime(2017, 12, 7, 00, 00)), + ] + + p = tmpdir.mkdir("test") + + 
writer = RecordArchiver(p, name="archive-test") + for rec in records: + writer.write(rec) + writer.close() + + assert p.join("2017/12/06").check(dir=1) + assert p.join("2017/12/07").check(dir=1) + + assert p.join("2017/12/06/archive-test-20171206T22.records.gz").check(file=1) + assert p.join("2017/12/06/archive-test-20171206T23.records.gz").check(file=1) + assert p.join("2017/12/07/archive-test-20171207T00.records.gz").check(file=1) + + # test archiving + before = p.join("2017/12/06").listdir() + writer = RecordArchiver(p, name="archive-test") + for rec in records: + writer.write(rec) + writer.close() + after = p.join("2017/12/06").listdir() + + assert set(before).issubset(set(after)) + assert len(after) > len(before) + assert len(before) == 2 + assert len(after) == 4 + + +def test_record_writer_stdout(): + writer = RecordWriter() + assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout) + + writer = RecordWriter(None) + assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout) + + writer = RecordWriter("") + assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout) + + # We cannot test RecordReader() because it will read from stdin during init + # reader = RecordReader() + # assert reader.fp == sys.stdin + + +def test_record_adapter_archive(tmpdir): + # archive some records, using "testing" as name + writer = RecordWriter("archive://{}?name=testing".format(tmpdir)) + dt = datetime.datetime.utcnow() + count = 0 + for rec in generate_records(): + writer.write(rec) + count += 1 + writer.close() + + # defaults to always archive by /YEAR/MONTH/DAY/ dir structure + outdir = tmpdir.join("{ts:%Y/%m/%d}".format(ts=dt)) + assert len(outdir.listdir()) + + # read the archived records and test filename and counts + count2 = 0 + for fname in outdir.listdir(): + assert fname.basename.startswith("testing-") + for rec in RecordReader(str(fname)): + count2 += 1 + assert count == count2 + + +def test_record_pathlib(tmp_path): + # Test support for Pathlib/PathLike objects + writer = RecordWriter(tmp_path / "test.records") + for rec in generate_records(100): + writer.write(rec) + writer.close() + + reader = RecordReader(tmp_path / "test.records") + assert len([rec for rec in reader]) == 100 + assert not isinstance(tmp_path / "test.records", str) + + +def test_record_pathlib_contextmanager(tmp_path): + with RecordWriter(tmp_path / "test.records") as writer: + for rec in generate_records(100): + writer.write(rec) + + with RecordReader(tmp_path / "test.records") as reader: + assert len([rec for rec in reader]) == 100 + assert not isinstance(tmp_path / "test.records", str) + + +def test_record_pathlib_contextmanager_double_close(tmp_path): + with RecordWriter(tmp_path / "test.records") as writer: + for rec in generate_records(100): + writer.write(rec) + writer.close() + + with RecordReader(tmp_path / "test.records") as reader: + assert len([rec for rec in reader]) == 100 + reader.close() + + +def test_record_invalid_recordstream(tmp_path): + path = str(tmp_path / "invalid_records") + with open(path, "wb") as f: + f.write(b"INVALID RECORD STREAM FILE") + + with pytest.raises(IOError): + with RecordReader(path) as reader: + for r in reader: + assert(r) + + +@pytest.mark.parametrize("adapter,contains", [ + ("csvfile", (b"5,hello,world", b"count,foo,bar,")), + ("jsonfile", (b'"count": 5', )), + ("text", (b"count=5", )), + ("line", (b"count = 5", b"--[ RECORD 5 ]--")), +]) +def test_record_adapter(adapter, contains, tmp_path): + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "count"), + 
("string", "foo"), + ("string", "bar"), + ]) + + # construct the RecordWriter with uri + path = tmp_path / "output" + uri = "{adapter}://{path!s}".format(adapter=adapter, path=path) + + # test parametrized contains + with RecordWriter(uri) as writer: + for i in range(10): + rec = TestRecord(count=i, foo="hello", bar="world") + writer.write(rec) + for pattern in contains: + assert pattern in path.read_bytes() + + # test include (excludes everything else except in include) + with RecordWriter("{}?fields=count".format(uri)) as writer: + for i in range(10): + rec = TestRecord(count=i, foo="hello", bar="world") + writer.write(rec) + + # test exclude + with RecordWriter("{}?exclude=count".format(uri)) as writer: + for i in range(10): + rec = TestRecord(count=i, foo="hello", bar="world") + writer.write(rec) + + +def test_text_record_adapter(capsys): + TestRecordWithFooBar = RecordDescriptor("test/record", [ + ("string", "name"), + ("string", "foo"), + ("string", "bar"), + ]) + TestRecordWithoutFooBar = RecordDescriptor("test/record2", [ + ("string", "name"), + ]) + format_spec = "Hello {name}, {foo} is {bar}!" + with RecordWriter(f"text://?format_spec={format_spec}") as writer: + # Format string with existing variables + rec = TestRecordWithFooBar(name="world", foo="foo", bar="bar") + writer.write(rec) + out, err = capsys.readouterr() + assert "Hello world, foo is bar!\n" == out + + # Format string with non-existing variables + rec = TestRecordWithoutFooBar(name="planet") + writer.write(rec) + out, err = capsys.readouterr() + assert "Hello planet, {foo} is {bar}!\n" == out + + +def test_recordstream_header(tmp_path): + # Create and delete a RecordWriter, with nothing happening + p = tmp_path / "out.records" + writer = RecordWriter(p) + del(writer) + assert p.read_bytes() == b"" + + # RecordWriter via context manager, always flushes and closes afterwards + p = tmp_path / "out2.records" + with RecordWriter(p) as writer: + pass + assert p.read_bytes() == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n" + + # Manual create of RecordWriter with no records and close (no flush) + p = tmp_path / "out3.records" + writer = RecordWriter(p) + writer.close() + assert p.read_bytes() == b"" + + # Manual RecordWriter with no records but flush and close + p = tmp_path / "out3.records" + writer = RecordWriter(p) + writer.flush() + writer.close() + assert p.read_bytes() == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n" + + # Manual RecordWriter with some records written, we flush to ensure output due to buffering + p = tmp_path / "out4.records" + writer = RecordWriter(p) + writer.write(next(generate_records())) + writer.flush() + del(writer) + assert p.read_bytes().startswith(b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n") + + +def test_recordstream_header_stdout(capsysbinary): + with RecordWriter() as writer: + pass + out, err = capsysbinary.readouterr() + assert out == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n" + + writer = RecordWriter() + del(writer) + out, err = capsysbinary.readouterr() + assert out == b"" + + writer = RecordWriter() + writer.close() + out, err = capsysbinary.readouterr() + assert out == b"" + + writer = RecordWriter() + writer.flush() + writer.close() + out, err = capsysbinary.readouterr() + assert out == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n" diff --git a/tests/test_record_descriptor.py b/tests/test_record_descriptor.py new file mode 100644 index 0000000..e9fde2b --- /dev/null +++ b/tests/test_record_descriptor.py @@ -0,0 +1,142 @@ +import struct +import hashlib + +from flow.record import RecordDescriptor +from 
flow.record import RecordField + + +def test_record_descriptor(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ("varint", "status"), + ]) + + # Get fields of type string + fields = TestRecord.getfields("string") + assert isinstance(fields, list) + assert len(fields) == 2 + assert isinstance(fields[0], RecordField) + assert fields[0].typename == "string" + assert fields[0].name == "url" + + # Get fields as tuples + fields = TestRecord.get_field_tuples() + assert isinstance(fields, tuple) + assert len(fields) == 3 + assert isinstance(fields[0], tuple) + assert fields[0][0] == "string" + assert fields[0][1] == "url" + + +def test_record_descriptor_clone(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ("varint", "status"), + ]) + + # Clone record descriptor + OtherRecord = RecordDescriptor("other/record", TestRecord) + + assert TestRecord.name == "test/record" + assert OtherRecord.name == "other/record" + assert TestRecord.descriptor_hash != OtherRecord.descriptor_hash + assert TestRecord.get_field_tuples() == OtherRecord.get_field_tuples() + + +def test_record_descriptor_extend(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ]) + + # Add field + ExtendedRecord = TestRecord.extend([("varint", "status")]) + + assert TestRecord.name == "test/record" + assert ExtendedRecord.name == "test/record" + assert TestRecord.descriptor_hash != ExtendedRecord.descriptor_hash + assert len(TestRecord.get_field_tuples()) == 2 + assert len(ExtendedRecord.get_field_tuples()) == 3 + + +def test_record_descriptor_hash_cache(): + # Get initial cache stats + TestRecord1 = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ]) + info = RecordDescriptor.calc_descriptor_hash.cache_info() + + # Create same descriptor, check cache hit increase + TestRecord2 = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ]) + info2 = RecordDescriptor.calc_descriptor_hash.cache_info() + assert info2.hits == info.hits + 1 + assert info.misses == info2.misses + assert TestRecord1.descriptor_hash == TestRecord2.descriptor_hash + + # Create different descriptor, check for cache miss increase + TestRecord3 = RecordDescriptor("test/record", [ + ("string", "url"), + ("string", "query"), + ("boolean", "test"), + ]) + info3 = RecordDescriptor.calc_descriptor_hash.cache_info() + assert info2.hits == info.hits + 1 + assert info3.misses == info.misses + 1 + assert TestRecord2.descriptor_hash != TestRecord3.descriptor_hash + + +def test_record_descriptor_hashing(): + """ Test if hashing is still consistent to keep compatibility """ + TestRecord = RecordDescriptor("test/hash", [ + ("boolean", "one"), + ("string", "two"), + ]) + + # known good values from flow.record version 1.4.1 + desc_hash = 1395243447 + desc_bytes = b"test/hashonebooleantwostring" + + # calculate + hash_digest = struct.unpack(">L", hashlib.sha256(desc_bytes).digest()[:4])[0] + assert desc_hash == hash_digest + + # verify current implementation + assert TestRecord.descriptor_hash == hash_digest + + +def test_record_descriptor_hash_eq(): + """ Tests __hash__() on RecordDescriptor """ + TestRecordSame1 = RecordDescriptor("test/same", [ + ("boolean", "one"), + ("string", "two"), + ]) + + TestRecordSame2 = RecordDescriptor("test/same", [ + ("boolean", "one"), + ("string", "two"), + ]) + + TestRecordDifferentName = RecordDescriptor("test/different", [ + ("boolean", "one"), + 
("string", "two"), + ]) + + TestRecordDifferentFields = RecordDescriptor("test/different", [ + ("varint", "one"), + ("float", "two"), + ]) + + # __hash__ + assert hash(TestRecordSame1) == hash(TestRecordSame2) + assert hash(TestRecordSame1) != hash(TestRecordDifferentName) + + # __eq__ + assert TestRecordSame1 == TestRecordSame2 + assert TestRecordSame1 != TestRecordDifferentName + assert TestRecordDifferentName != TestRecordDifferentFields diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..d1c9ea4 --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,376 @@ +import pytest +import codecs +import os +import datetime +import sys + +import msgpack + +from flow.record import ( + base, + whitelist, + fieldtypes, + Record, + GroupedRecord, + RecordDescriptor, + RecordPacker, + RECORD_VERSION, + RecordReader, + RecordWriter, +) +from flow.record.base import is_valid_field_name +from flow.record.packer import RECORD_PACK_EXT_TYPE, RECORD_PACK_TYPE_RECORD +from flow.record.selector import Selector, CompiledSelector + + +def test_datetime_serialization(): + packer = RecordPacker() + + now = datetime.datetime.utcnow() + + for tz in ["UTC", "Europe/Amsterdam"]: + os.environ["TZ"] = tz + + descriptor = RecordDescriptor(""" +test/datetime + datetime datetime; +""") + + record = descriptor.recordType(datetime=now) + data = packer.pack(record) + r = packer.unpack(data) + + assert r.datetime == now + + +def test_long_int_serialization(): + packer = RecordPacker() + + long_types = RecordDescriptor(""" +test/long_types + varint long_type; + varint int_type; + varint long_type_neg; + varint int_type_neg; + varint max_int_as_long; + """) + + l = 1239812398217398127398217389217389217398271398217321 # noqa: E741 + i = 888888 + lneg = -3239812398217398127398217389217389217398271398217321 + ineg = -988888 + max_int_as_long = sys.maxsize + + record = long_types(l, i, lneg, ineg, max_int_as_long) + data = packer.pack(record) + r = packer.unpack(data) + + assert r.long_type == l + assert r.int_type == i + assert r.long_type_neg == lneg + assert r.int_type_neg == ineg + assert r.max_int_as_long == max_int_as_long + + +def test_unicode_serialization(): + packer = RecordPacker() + + descriptor = RecordDescriptor(""" +test/unicode + string text; +""") + + puny_domains = [b'xn--s7y.co', b'xn--80ak6aa92e.com', b'xn--pple-43d.com'] + + for p in puny_domains: + domain = codecs.decode(p, "idna") + record = descriptor.recordType(text=domain) + d = packer.pack(record) + record2 = packer.unpack(d) + + assert record.text == record2.text + assert record.text == domain + + +def test_pack_long_int_serialization(): + packer = RecordPacker() + # test if 'long int' that fit in the 'int' type would be packed as int internally + + max_neg_int = -0x8000000000000000 + d = packer.pack([1234, 123456, max_neg_int, sys.maxsize]) + assert d == b'\x94\xcd\x04\xd2\xce\x00\x01\xe2@\xd3\x80\x00\x00\x00\x00\x00\x00\x00\xcf\x7f\xff\xff\xff\xff\xff\xff\xff' # noqa: E501 + + +def test_non_existing_field(): + # RecordDescriptor that is used to test locally in the Broker client + TestRecord = RecordDescriptor("test/record", [ + ("string", "text"), + ]) + x = TestRecord(text="Fox-IT, For a More Secure Society") + + # r.content does not exist in the RecordDescriptor + assert Selector('lower("Fox-IT") in lower(r.content)').match(x) is False + assert Selector('"Fox-IT" in r.content').match(x) is False + # because the field does not exist, it will still evaluate to False even for negative matches + assert 
Selector('"Fox-IT" not in r.content').match(x) is False + assert Selector('"Fox-IT" in r.content').match(x) is False + assert Selector('"Fox-IT" != r.content').match(x) is False + assert Selector('"Fox-IT" == r.content').match(x) is False + assert Selector('r.content == "Fox-IT, For a More Secure Society"').match(x) is False + assert Selector('r.content != "Fox-IT, For a More Secure Society"').match(x) is False + assert Selector('r.content in "Fox-IT, For a More Secure Society!"').match(x) is False + assert Selector('r.content not in "Fox-IT, For a More Secure Society!"').match(x) is False + + # r.text exist in the RecordDescriptor + assert Selector('"fox-it" in lower(r.text)').match(x) + assert Selector('r.text in "Fox-IT, For a More Secure Society!!"').match(x) + assert Selector('r.text == "Fox-IT, For a More Secure Society"').match(x) + assert Selector('r.text != "Fox-IT"').match(x) + assert Selector('lower("SECURE") in lower(r.text)').match(x) + assert Selector('"f0x-1t" not in lower(r.text)').match(x) + assert Selector('lower("NOT SECURE") not in lower(r.text)').match(x) + + +def test_set_field_type(): + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "value"), + ]) + + r = TestRecord(1) + + assert isinstance(r.value, fieldtypes.uint32) + r.value = 2 + assert isinstance(r.value, fieldtypes.uint32) + + with pytest.raises(ValueError): + r.value = 'lalala' + r.value = 2 + + r = TestRecord() + assert r.value is None + r.value = 1234 + assert r.value == 1234 + with pytest.raises(TypeError): + r.value = [1, 2, 3, 4, 5] + + +def test_packer_unpacker_none_values(): + """Tests packing and unpacking of Empty records (default values of None).""" + packer = RecordPacker() + + # construct field types from all available fieldtypes + field_tuples = [] + for typename in whitelist.WHITELIST: + fieldname = "field_{}".format(typename.replace(".", "_").lower()) + field_tuples.append((typename, fieldname)) + + # create a TestRecord descriptor containing all the fieldtypes + TestRecord = RecordDescriptor("test/empty_record", field_tuples) + + # initialize an Empty record and serialize/deserialize + record = TestRecord() + data = packer.pack(record) + r = packer.unpack(data) + assert isinstance(r, Record) + + +def test_fieldname_regression(): + TestRecord = RecordDescriptor("test/uri_typed", [ + ("string", "fieldname"), + ]) + rec = TestRecord('omg regression') + + assert rec in Selector("r.fieldname == 'omg regression'") + + with pytest.raises(AttributeError): + assert rec not in Selector("fieldname == 'omg regression'") + + +def test_version_field_regression(): + packer = RecordPacker() + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "value"), + ]) + + r = TestRecord(1) + + assert r.__slots__[-1] == '_version' + + r._version = 256 + data = packer.pack(r) + with pytest.warns(RuntimeWarning) as record: + packer.unpack(data) + + assert len(record) == 1 + assert record[0].message.args[0].startswith("Got old style record with no version information") + + r._version = RECORD_VERSION + 1 if RECORD_VERSION < 255 else RECORD_VERSION - 1 + data = packer.pack(r) + with pytest.warns(RuntimeWarning) as record: + packer.unpack(data) + + assert len(record) == 1 + assert record[0].message.args[0].startswith("Got other version record") + + +def test_reserved_field_count_regression(): + del base.RESERVED_FIELDS['_version'] + base.RESERVED_FIELDS['_extra'] = 'varint' + base.RESERVED_FIELDS['_version'] = 'varint' + + TestRecordExtra = RecordDescriptor("test/record", [ + ("uint32", "value"), + ]) + + 
del base.RESERVED_FIELDS['_extra'] + + TestRecordBase = RecordDescriptor("test/record", [ + ("uint32", "value"), + ]) + + packer = RecordPacker() + r = TestRecordExtra(1, _extra=1337) + + assert r.value == 1 + assert r._extra == 1337 + + data = packer.pack(r) + packer.register(TestRecordBase) + + unpacked = packer.unpack(data) + + with pytest.raises(AttributeError): + unpacked._extra + + assert unpacked.value == 1 + assert unpacked._version == 1 + + +def test_no_version_field_regression(): + # Emulate old style record + packer = RecordPacker() + TestRecord = RecordDescriptor("test/record", [ + ("uint32", "value"), + ]) + packer.register(TestRecord) + + r = TestRecord(1) + + packed = r._pack() + mod = (packed[0], packed[1][:-1]) # Strip version field + rdata = packer.pack((RECORD_PACK_TYPE_RECORD, mod)) + data = packer.pack(msgpack.ExtType(RECORD_PACK_EXT_TYPE, rdata)) + + with pytest.warns(RuntimeWarning) as record: + unpacked = packer.unpack(data) + + assert len(record) == 1 + assert record[0].message.args[0].startswith("Got old style record with no version information") + + assert unpacked.value == 1 + assert unpacked._version == 1 # Version field implicitly added + + +def test_mixed_case_name(): + assert is_valid_field_name("Test") + assert is_valid_field_name("test") + assert is_valid_field_name("TEST") + + TestRecord = RecordDescriptor("Test/Record", [ + ("uint32", "Value"), + ]) + + r = TestRecord(1) + assert r.Value == 1 + + +def test_multi_grouped_record_serialization(tmp_path): + TestRecord = RecordDescriptor("Test/Record", [ + ("net.ipv4.Address", "ip"), + ]) + GeoRecord = RecordDescriptor("geoip/country", [ + ("string", "country"), + ("string", "city"), + ]) + ASNRecord = RecordDescriptor("geoip/asn", [ + ("string", "asn"), + ("string", "isp"), + ]) + + test_rec = TestRecord("1.3.3.7") + geo_rec = GeoRecord(country="Netherlands", city="Delft") + + grouped_rec = GroupedRecord("grouped/geoip", [test_rec, geo_rec]) + asn_rec = ASNRecord(asn="1337", isp="Cyberspace") + record = GroupedRecord("grouped/geo/asn", [grouped_rec, asn_rec]) + + assert record.ip == "1.3.3.7" + assert record.country == "Netherlands" + assert record.city == "Delft" + assert record.asn == "1337" + assert record.isp == "Cyberspace" + + writer = RecordWriter(tmp_path / "out.record") + writer.write(record) + writer.close() + + reader = RecordReader(tmp_path / "out.record") + records = list(reader) + assert len(records) == 1 + record = records[0] + assert record.ip == "1.3.3.7" + assert record.country == "Netherlands" + assert record.city == "Delft" + assert record.asn == "1337" + assert record.isp == "Cyberspace" + + +@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector]) +def test_ast_unicode_literals(PSelector): + TestRecord = RecordDescriptor("Test/Record", []) + assert TestRecord() in PSelector("get_type('string literal') == get_type(u'hello')") + assert TestRecord() in PSelector("get_type('not bytes') != get_type(b'hello')") + + +def test_grouped_replace(): + TestRecord = RecordDescriptor("test/adapter", [ + ("uint32", "number"), + ]) + OtherRecord = RecordDescriptor("test/other", [ + ("string", "other"), + ]) + + # Constructing grouped record normally + record = TestRecord(number=1) + other_record = OtherRecord("foobar") + grouped_record = GroupedRecord("grouped/original", [record, other_record]) + assert(grouped_record._source is None) + assert(grouped_record.number == 1) + assert(grouped_record.other == "foobar") + + # Constructing grouped record normally (using a replaced record) + 
replaced_record = record._replace(_source="newsource") + grouped_record = GroupedRecord("grouped/replaced", [replaced_record, other_record]) + assert(grouped_record._source == "newsource") + assert(grouped_record.number == 1) + assert(grouped_record.other == "foobar") + + # Test GroupedRecord replace + replaced_grouped_record = grouped_record._replace(number=100) + assert(replaced_grouped_record.number == 100) + assert(replaced_grouped_record.other == "foobar") + + # Test with multiple replacements + replaced_grouped_record = grouped_record._replace(number=200, other="a string", _source="testcase") + assert(replaced_grouped_record.number == 200) + assert(replaced_grouped_record.other == "a string") + assert(replaced_grouped_record._source == "testcase") + + # Replacement with non existing field should raise a ValueError + with pytest.raises(ValueError) as excinfo: + grouped_record._replace(number=100, other="changed", non_existing_field="oops") + excinfo.match(".*Got unexpected field names:.*non_existing_field.*") + + +if __name__ == "__main__": + __import__("standalone_test").main(globals()) diff --git a/tests/test_selector.py b/tests/test_selector.py new file mode 100644 index 0000000..8a9fda1 --- /dev/null +++ b/tests/test_selector.py @@ -0,0 +1,504 @@ +from datetime import datetime + +import pytest + +from flow.record import RecordDescriptor +from flow.record.selector import CompiledSelector, InvalidOperation, Selector + + +def test_selector_func_name(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "query"), + ("string", "url"), + ]) + assert TestRecord(None, None) not in Selector("name(r) == 'foo/bar'") + assert TestRecord(None, None) in Selector("name(r) == 'test/record'") + + +def test_selector(): + TestRecord = RecordDescriptor("test/record", [ + ("string", "query"), + ("string", "url"), + ]) + TestRecord2 = RecordDescriptor("test/record2", [ + ("string", "key"), + ("string", "content"), + ]) + + assert TestRecord("foo", "bar") in Selector("r.query == 'foo'") + assert TestRecord(None, None) not in Selector("r.query == 'foo'") + assert TestRecord(None, None) not in Selector("name(r.query) == 'XX'") + + with pytest.raises(InvalidOperation): + assert TestRecord(None, None) not in Selector("r.__class__ == 'str'") + + s = Selector("lower(upper(r.content)) == 'xx'") + assert TestRecord("XX", "XX") not in s + assert TestRecord2("XX", "XX") in s + + assert TestRecord(None, "BAR") in Selector( + "lower(r.query) == 'test' or lower(r.adsadsa) == 't' or lower(r.url) == 'bar'") + + with pytest.raises(InvalidOperation): + assert TestRecord() in Selector("invalid_func(r.invalid_field, 1337) or r.id == 4") + + +def test_selector_meta_query_true(): + source = "internal/flow.record.test" + + desc = RecordDescriptor("test/record", [ + ("string", "value"), + ]) + rec = desc("value", _source=source) + assert rec in Selector("r._source == '{}'".format(source)) + + +def test_selector_meta_query_false(): + source = "internal/flow.record.test" + + desc = RecordDescriptor("test/record", [ + ("string", "value"), + ]) + rec = desc("value", _source=source + "nope") + assert (rec in Selector("r._source == '{}'".format(source))) is False + + +def test_selector_basic_query_true(): + md5hash = "My MD5 hash!" + + desc = RecordDescriptor("test/md5_hash", [ + ("string", "md5"), + ]) + rec = desc(md5hash) + assert rec in Selector("r.md5 == '{}'".format(md5hash)) + + +def test_selector_basic_query_false(): + md5hash = "My MD5 hash!" 
+ + desc = RecordDescriptor("test/md5_hash", [ + ("string", "md5"), + ]) + rec = desc(md5hash + "nope") + assert (rec in Selector("r.md5 == '{}'".format(md5hash))) is False + + +def test_selector_non_existing_field(): + md5hash = "My MD5 hash!" + + desc = RecordDescriptor("test/md5_hash", [ + ("string", "md5"), + ]) + rec = desc(md5hash) + assert (rec in Selector("r.non_existing_field == 1337")) is False + + +# [MS] Disabled, list types? +# def test_selector_string_in_array(): +# obj = Expando() +# obj.filenames = ['record_mitchel_keystrokes.exe', 'python.exe', 'chrome.exe'] + +# s = Selector("'{}' in r.filenames".format(obj.filenames[0])) +# assert (obj in s) is True + + +def test_selector_string_contains(): + desc = RecordDescriptor("test/filetype", [ + ("string", "filetype"), + ]) + rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows') + + assert rec in Selector("'PE' in r.filetype") + + +def test_selector_not_in_operator(): + desc = RecordDescriptor("test/md5_hash", [ + ("string", "filetype"), + ]) + rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows') + + assert rec in Selector("'ELF' not in r.filetype") + + +def test_selector_or_operator(): + desc = RecordDescriptor("test/filetype", [ + ("string", "filetype"), + ]) + rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows') + + assert rec in Selector("'PE32' in r.filetype or 'PE64' in r.xxxx") + + +def test_selector_and_operator(): + desc = RecordDescriptor("test/filetype", [ + ("string", "filetype"), + ("string", "xxxx"), + ]) + + rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows', 'PE32 executable (GUI) Intel 80386, for MS Windows') + + assert rec in Selector("'PE32' in r.filetype and 'PE32' in r.xxxx") + + +def test_selector_in_function(): + desc = RecordDescriptor("test/filetype", [ + ("string", "filetype"), + ]) + rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows') + + assert rec in Selector("'pe' in lower(r.filetype)") + + +def test_selector_function_call_whitelisting(): + TestRecord = RecordDescriptor("test/filetype", [ + ("string", "filetype"), + ]) + rec = TestRecord('PE32 executable (GUI) Intel 80386, for MS Windows') + + # We allow explicitly exposed functions + assert rec in Selector("'pe32' in lower(r.filetype)") + # But functions on types are not + with pytest.raises(Exception) as excinfo: + rec in Selector("'pe' in r.filetype.lower()") + + assert rec in Selector("'EXECUTABLE' in upper(r.filetype)") + with pytest.raises(Exception) as excinfo: + rec in Selector("'EXECUTABLE' in r.filetype.upper()") + + IPRecord = RecordDescriptor("test/address", [ + ("net.ipv4.Address", "ip"), + ]) + rec = IPRecord("192.168.1.1") + assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.1.0/24')") + assert rec not in Selector("r.non_existing_field in net.ipv4.Subnet('192.168.1.0/24')") + + # We call net.ipv4 instead of net.ipv4.Subnet, which should fail + with pytest.raises(Exception) as excinfo: + assert rec in Selector("r.ip in net.ipv4('192.168.1.0/24')") + excinfo.match("Call 'net.ipv4' not allowed. 
No calls other then whitelisted 'global' calls allowed!") + + +def test_selector_subnet(): + desc = RecordDescriptor("test/ip", [ + ("net.ipv4.Address", "ip"), + ]) + rec = desc('192.168.10.1') + + assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.1/32')") + assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.0/24')") + assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.0.0/16')") + assert rec in Selector("r.ip in net.ipv4.Subnet('192.0.0.0/8')") + assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.1')") + assert rec in Selector("r.ip not in net.ipv4.Subnet('10.0.0.0/8')") + + +def test_field_equals(): + desc = RecordDescriptor("test/record", [ + ("string", "mailfrom"), + ("string", "mailto"), + ("string", "foo"), + ]) + rec = desc("hello@world.com", "foo@bar.com", "testing") + assert rec in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hello@world.com',])") + assert rec in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hElLo@WoRlD.com',])") + assert rec not in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hElLo@WoRlD.com',], nocase=False)") + assert rec not in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hello',])") + + +def test_field_contains(): + desc = RecordDescriptor("test/record", [ + ("string", "mailfrom"), + ("string", "mailto"), + ("string", "foo"), + ]) + rec = desc("hello@world.com", "foo@bar.com", "testing") + rec2 = desc("hello@world.com", "foo@bar.com") + + assert rec in CompiledSelector("field_contains(r, ['mailfrom', 'mailto'], ['foo@bar.com', 'test@fox-it.com'])") + assert rec in CompiledSelector("field_contains(r, ['mailfrom', 'mailto'], ['FOO', 'HELLO'])") + assert rec in Selector("field_contains(r, ['mailfrom', 'mailto'], ['FOO', 'HELLO'])") + assert rec2 not in CompiledSelector("field_contains(r, ['testing'], ['TEST@fox-it.com'])") + + +def test_field_contains_word_boundary(): + desc = RecordDescriptor("test/record", [ + ("string", "mailfrom"), + ("string", "mailto"), + ("string", "foo"), + ("string", "content"), + ]) + rec = desc("hello@world.com", "foo@bar.com", "testing", "This is a testing string") + rec2 = desc("helloworld@world.com", "foo@bar.com") + rec3 = desc(None, None) + rec4 = desc(None, None, "hello@world.com") + rec5 = desc() + assert rec in Selector( + "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)") + assert rec not in Selector( + "field_contains(r, ['mailfrom', 'mailto'], ['hello.'], word_boundary=True)") # Check regex escaping... 
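+    # The needle should be treated literally (regex-escaped), so the '.' in 'hello.' does not match the '@' in 'hello@world.com'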
+ assert rec not in Selector( + "field_contains(r, ['mailfrom', 'mailto'], ['HELLO'], nocase=False, word_boundary=True)") + assert rec2 not in Selector( + "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)") + assert rec2 not in Selector( + "field_contains(r, ['mailfrom', 'mailto', 'nonexistingfield'], ['hello'], word_boundary=True)") + assert rec3 not in Selector( + "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)") + assert rec4 in Selector( + "field_contains(r, ['mailfrom', 'mailto', 'foo'], ['hello'], word_boundary=True)") + assert rec5 not in Selector( + "field_contains(r, ['mailfrom', 'mailto', 'foo'], ['hello'], word_boundary=True)") + + assert rec not in Selector("field_contains(r, ['content'], ['sting'], word_boundary=True)") + assert rec in Selector("field_contains(r, ['content'], ['testing'], word_boundary=True)") + + +def test_field_regex(): + desc = RecordDescriptor("test/record", [ + ("string", "mailfrom"), + ("string", "mailto"), + ("string", "foo"), + ]) + rec = desc("hello@world.com", "foo@bar.com", "testing") + + assert rec in Selector(r"field_regex(r, ['mailfrom', 'mailto'], r'.+@.+\.com')") + assert rec in CompiledSelector(r"field_regex(r, ['mailfrom', 'mailto'], r'.+@.+\.com')") + assert rec not in Selector("field_regex(r, ['mailfrom', 'mailto'], r'.+@fox-it.com')") + assert rec not in CompiledSelector("field_regex(r, ['mailfrom', 'mailto'], r'.+@fox-it.com')") + + +def test_selector_uri(): + TestRecord = RecordDescriptor("test/uri", [ + ("uri", "uri"), + ]) + rec = TestRecord('http://www.google.com/evil.bin') + assert rec in Selector("r.uri.filename in ['evil.bin', 'foo.bar']") + + +def test_selector_typed(): + TestRecord = RecordDescriptor("test/uri_typed", [ + ("uri", "urifield1"), + ("uri", "urifield2"), + ("string", "stringfield"), + ]) + rec = TestRecord('helloworld.exe', 'another.bin', 'Fox-IT') + assert rec in Selector("Type.uri.filename == 'helloworld.exe'") + assert rec in CompiledSelector("Type.uri.filename == 'helloworld.exe'") + assert rec in Selector("Type.uri.filename != 'howdyworld.exe'") + assert rec in CompiledSelector("Type.uri.filename != 'howdyworld.exe'") + assert rec in Selector("'another' in Type.uri.filename") + assert rec in CompiledSelector("'another' in Type.uri.filename") + assert rec in Selector("field_contains(r, Type.uri.filename, ['hello'])") + assert rec in CompiledSelector("field_contains(r, Type.uri.filename, ['hello'])") + assert rec in Selector("field_equals(r, Type.uri.filename, ['another.bin'])") + assert rec in CompiledSelector("field_equals(r, Type.uri.filename, ['another.bin'])") + assert rec in Selector(r"field_regex(r, Type.uri.filename, r'hello\w{5}.exe')") + assert rec in CompiledSelector(r"field_regex(r, Type.uri.filename, r'hello\w{5}.exe')") + + # Test TypeMatcher reuse + assert rec in Selector("Type.uri.filename == 'helloworld.exe' or Type.uri.filename == 'another.bin'") + assert rec in CompiledSelector("Type.uri.filename == 'helloworld.exe' or Type.uri.filename == 'another.bin'") + + assert rec in Selector("Type.string == 'Fox-IT'") + assert rec in CompiledSelector("Type.string == 'Fox-IT'") + assert rec in Selector("field_equals(r, Type.string, ['Fox-IT'])") + assert rec in CompiledSelector("field_equals(r, Type.string, ['Fox-IT'])") + assert rec in Selector("field_contains(r, Type.string, ['Fox'])") + assert rec in CompiledSelector("field_contains(r, Type.string, ['Fox'])") + assert rec in Selector(r"field_regex(r, Type.string, r'Fox-\w{2}')") + assert rec in 
CompiledSelector(r"field_regex(r, Type.string, r'Fox-\w{2}')") + + assert rec not in Selector("Type.filename == 'lalala'") + assert rec not in CompiledSelector("Type.filename == 'lalala'") + assert rec not in Selector("Type.uri.filename == 'lalala'") + assert rec not in CompiledSelector("Type.uri.filename == 'lalala'") + assert rec not in Selector("field_contains(r, Type.uri.filename, ['nope'])") + assert rec not in CompiledSelector("field_contains(r, Type.uri.filename, ['nope'])") + assert rec not in Selector("field_equals(r, Type.uri.filename, ['nope'])") + assert rec not in CompiledSelector("field_equals(r, Type.uri.filename, ['nope'])") + assert rec not in Selector("field_regex(r, Type.uri.filename, 'nope')") + assert rec not in CompiledSelector("field_regex(r, Type.uri.filename, 'nope')") + + TestNamespaceRecord = RecordDescriptor("test/ip", [ + ("net.ipv4.Address", "ip"), + ]) + rec = TestNamespaceRecord('192.168.10.1') + + # This will only work in "normal" selectors, because we need to override the behaviour + # of the __contains__ operator to unwrap the requested values + assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.1/32')") + assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.0/24')") + assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.0.0/16')") + assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.0.0.0/8')") + assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.1')") + assert rec in Selector("Type.net.ipv4.Address not in net.ipv4.Subnet('10.0.0.0/8')") + + with pytest.raises(InvalidOperation): + assert rec in Selector("Type.uri.filename.__class__ == 'invalid'") + + +def test_selector_unicode(): + TestRecord = RecordDescriptor("test/string", [ + ("string", "name"), + ]) + rec = TestRecord("Jack O'Neill") + assert rec not in Selector("field_contains(r, ['name'], [u'Jack O\u2019Neill'])") + + rec = TestRecord(u"jack o\u2019neill") + assert rec in Selector("field_contains(r, ['name'], [u'Jack O\u2019Neill'])") + + +def test_record_in_records(): + RecordA = RecordDescriptor("test/record_a", [ + ("datetime", "some_dt"), + ("string", "field"), + ]) + RecordB = RecordDescriptor("test/record_b", [ + ("record", "record"), + ("datetime", "some_dt"), + ]) + RecordC = RecordDescriptor("test/record_c", [ + ("record[]", "records"), + ]) + RecordD = RecordDescriptor("test/record_d", [ + ("string[]", "stringlist"), + ]) + + test_str = "this is a test" + dt = datetime.utcnow() + record_a = RecordA( + some_dt=dt, + field=test_str) + record_b = RecordB( + record=record_a, + some_dt=dt) + + subrecords = [] + record_d = None + for i in range(10): + record_d = RecordD( + stringlist=["aap", "noot", "mies", "Subrecord {}".format(i)]) + subrecords.append(record_d) + + subrecords.append(record_a) + record_c = RecordC( + records=subrecords) + + subrecords.append(None) + record_c_with_none_values = RecordC( + records=subrecords) + + assert record_b in Selector("r.record.field == '{}'".format(test_str)) + assert record_b in Selector("Type.string == '{}'".format(test_str)) + assert record_c in Selector("Type.string == '{}'".format(test_str)) + assert record_d in Selector("any(s == 'Subrecord 9' for s in r.stringlist)") + assert record_c in Selector("any(s == 'Subrecord 9' for e in r.records for s in e.stringlist)") + assert record_c_with_none_values in Selector( + "any(s == 'Subrecord 9' for e in r.records for s in e.stringlist)") + assert record_d not in Selector("any(s == 'Subrecord 
9' for s in r.nonexistingfield)")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_non_existing_field(PSelector):
+    TestRecord = RecordDescriptor("test/record", [
+        ("string", "query"),
+        ("string", "url"),
+    ])
+
+    assert TestRecord("foo", "bar") not in PSelector("r.query and r.non_existing_field")
+    assert TestRecord("foo", "bar") in PSelector("not r.non_existing_field")
+    assert TestRecord("foo", "bar") in PSelector("r.query and r.url and not r.non_existing_field")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_modulo(PSelector):
+    TestRecord = RecordDescriptor("test/record", [
+        ("varint", "counter"),
+    ])
+
+    records = []
+    for i in range(300):
+        records.append(TestRecord(i))
+
+    selected = [rec for rec in records if rec in PSelector("r.counter % 10 == 0")]
+    assert len(selected) == 30
+
+    for rec in records:
+        sel = PSelector("r.counter % 10 == 0")
+        if rec.counter % 10 == 0:
+            assert rec in sel
+        else:
+            assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_bit_and(PSelector):
+    TestRecord = RecordDescriptor("test/record", [
+        ("varint", "counter"),
+    ])
+
+    records = []
+    for i in range(300):
+        records.append(TestRecord(i))
+
+    for rec in records:
+        sel = PSelector("(r.counter & 0x0F) == 1")
+        if rec.counter & 0x0F == 1:
+            assert rec in sel
+        else:
+            assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_bit_or(PSelector):
+    TestRecord = RecordDescriptor("test/record", [
+        ("varint", "counter"),
+    ])
+
+    records = []
+    for i in range(300):
+        records.append(TestRecord(i))
+
+    for rec in records:
+        sel = PSelector("(r.counter | 0x10) == 0x11")
+        if rec.counter | 0x10 == 0x11:
+            assert rec in sel
+        else:
+            assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_modulo_non_existing_field(PSelector):
+    TestRecord = RecordDescriptor("test/record", [
+        ("varint", "counter"),
+    ])
+
+    records = []
+    for i in range(300):
+        records.append(TestRecord(i))
+
+    sel = PSelector("r.counter % 10 == 0")
+    for rec in records:
+        if rec.counter % 10 == 0:
+            assert rec in sel
+        else:
+            assert rec not in sel
+
+    # Test with non-existing fields
+    # using has_field() ensures that this works with CompiledSelector and Selector
+    sel = PSelector("has_field(r, 'counterz') and r.counterz % 10 == 0")
+    for rec in records:
+        if hasattr(rec, "counterz") and rec.counterz % 10 == 0:
+            assert rec in sel
+        else:
+            assert rec not in sel
+
+    # non-existing field but without the precheck (this does not work with CompiledSelector)
+    if PSelector is Selector:
+        sel = PSelector("r.counterz % 10 == 0")
+        for rec in records:
+            assert rec not in sel
+
+
+if __name__ == "__main__":
+    __import__("standalone_test").main(globals())
diff --git a/tests/test_splunk_adapter.py b/tests/test_splunk_adapter.py
new file mode 100644
index 0000000..38c910b
--- /dev/null
+++ b/tests/test_splunk_adapter.py
@@ -0,0 +1,112 @@
+from unittest import mock
+
+from flow.record import RecordDescriptor
+import flow.record.adapter.splunk
+from flow.record.adapter.splunk import splunkify
+
+
+def test_splunkify_reserved_field():
+
+    with mock.patch.object(
+        flow.record.adapter.splunk,
+        "RESERVED_SPLUNK_FIELDS",
+        set(["foo"])
+    ):
+        test_record_descriptor = RecordDescriptor(
+            "test/record",
+            [("string", "foo")]
+        )
+
+        test_record = test_record_descriptor(foo="bar")
+
+        output = 
splunkify(test_record) + assert output == 'type="test/record" rdtag=None rd_foo="bar"' + + +def test_splunkify_normal_field(): + + with mock.patch.object( + flow.record.adapter.splunk, + "RESERVED_SPLUNK_FIELDS", + set() + ): + test_record_descriptor = RecordDescriptor( + "test/record", + [("string", "foo")] + ) + + test_record = test_record_descriptor(foo="bar") + + output = splunkify(test_record) + assert output == 'type="test/record" rdtag=None foo="bar"' + + +def test_splunkify_rdtag_field(): + + with mock.patch.object( + flow.record.adapter.splunk, + "RESERVED_SPLUNK_FIELDS", + set() + ): + test_record_descriptor = RecordDescriptor( + "test/record", + ) + + test_record = test_record_descriptor() + + output = splunkify(test_record, tag="bar") + assert output == 'type="test/record" rdtag="bar"' + + +def test_splunkify_none_field(): + + with mock.patch.object( + flow.record.adapter.splunk, + "RESERVED_SPLUNK_FIELDS", + set() + ): + test_record_descriptor = RecordDescriptor( + "test/record", + [("string", "foo")] + ) + + test_record = test_record_descriptor() + + output = splunkify(test_record) + assert output == 'type="test/record" rdtag=None foo=None' + + +def test_splunkify_byte_field(): + + with mock.patch.object( + flow.record.adapter.splunk, + "RESERVED_SPLUNK_FIELDS", + set() + ): + test_record_descriptor = RecordDescriptor( + "test/record", + [("bytes", "foo")] + ) + + test_record = test_record_descriptor(foo=b"bar") + + output = splunkify(test_record) + assert output == 'type="test/record" rdtag=None foo="YmFy"' + + +def test_splunkify_backslash_quote_field(): + + with mock.patch.object( + flow.record.adapter.splunk, + "RESERVED_SPLUNK_FIELDS", + set() + ): + test_record_descriptor = RecordDescriptor( + "test/record", + [("string", "foo")] + ) + + test_record = test_record_descriptor(foo=b"\\\"") + + output = splunkify(test_record) + assert output == 'type="test/record" rdtag=None foo="\\\\\\""' diff --git a/tests/utils_inspect.py b/tests/utils_inspect.py new file mode 100644 index 0000000..4427491 --- /dev/null +++ b/tests/utils_inspect.py @@ -0,0 +1,58 @@ +""" +Backport of `inspect.signature` for Python 2. 
+
+Based on: https://github.com/python/cpython/blob/3.7/Lib/inspect.py
+"""
+
+import inspect
+import collections
+
+
+class _empty:
+    pass
+
+
+class Parameter:
+    POSITIONAL_ONLY = 0
+    POSITIONAL_OR_KEYWORD = 1
+    VAR_POSITIONAL = 2
+    KEYWORD_ONLY = 3
+    VAR_KEYWORD = 4
+
+    empty = _empty
+
+    def __init__(self, name, kind, default=_empty):
+        self.name = name
+        self.kind = kind
+        self.default = default
+
+
+class Signature:
+    empty = _empty
+
+    def __init__(self, parameters=None):
+        self.parameters = parameters
+
+
+def signature(obj):
+    try:
+        # Python 3
+        return inspect.signature(obj)
+    except AttributeError:
+        # Python 2
+        spec = inspect.getargspec(obj)
+
+        # Create parameter objects which are compatible with python 3 objects
+        parameters = collections.OrderedDict()
+        for i in range(0, len(spec.args)):
+            arg = spec.args[i]
+            default = _empty
+            if spec.defaults and (len(spec.args) - i <= len(spec.defaults)):
+                default = spec.defaults[i - len(spec.args)]
+            parameters[arg] = Parameter(name=arg, default=default, kind=Parameter.POSITIONAL_OR_KEYWORD)
+        if spec.varargs:
+            parameters[spec.varargs] = Parameter(name=spec.varargs, kind=Parameter.VAR_POSITIONAL)
+        if spec.keywords:
+            parameters[spec.keywords] = Parameter(name=spec.keywords, kind=Parameter.VAR_KEYWORD)
+
+        return Signature(parameters=parameters)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..7293d76
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,58 @@
+[tox]
+envlist = lint, py3, pypy3
+# This version of tox will autoprovision itself and the requirements defined in
+# requires if they are not available on the host system.
+minversion = 3.8.0
+# This version of virtualenv installs a pip version of at least 19.0.1 in its
+# venvs.
+# Requiring minimally this version of virtualenv to be available prevents the
+# need of having to explicitly specify a pip>=19.0 dependency in every testenv.
+# pip>=19.0 is needed to ensure the sdist built by tox (which is built
+# according to PEP 517 and PEP 518 by tox versions >= 3.4.0) is also installed
+# properly (according to PEP 517 and PEP 518 by pip>=19.0) in the virtualenvs.
+# If the dependency is not available on the host system, and the installed tox
+# version is >= 3.3.0, tox will self bootstrap an environment with the proper
+# versions (including the version of tox itself).
+requires = virtualenv>=16.3.0
+isolated_build = true
+# Putting the dist dir in the project directory instead of in the {toxworkdir},
+# makes the sdist more easily accessible and prevents the need of rebuilding it
+# for the [testenv:build] target.
+distdir = {toxinidir}/dist
+
+[testenv]
+deps =
+    pytest
+    pytest-cov
+    coverage
+commands =
+# Capturing output will fail on pypy, possibly due to this issue: https://github.com/pytest-dev/pytest/issues/5502
+    pytest --basetemp="{envtmpdir}" {posargs:--color=yes --capture=no --cov=flow --cov-report=term-missing -v tests}
+    coverage report
+    coverage xml
+
+[testenv:lint]
+# Force the Python version here, so linting will be done with the correct
+# Python version. There should be no difference between the CPython and pypy
+# implementations, so we pick one.
+basepython = python3
+deps =
+    flake8
+commands =
+    flake8 flow tests setup.py
+
+[testenv:build]
+# Force the Python version here, so building will be done with the correct
+# Python version. As the distributions are pure Python, there should be no
+# difference between the CPython and pypy implementations, so we pick one.
+basepython = python3
+deps =
+commands =
+    pip wheel --no-deps -w ./dist . 
+ +[flake8] +max-line-length = 120 +extend-ignore = + # See https://github.com/PyCQA/pycodestyle/issues/373 + E203, +statistics = True