diff --git a/.github/workflows/dissect-ci.yml b/.github/workflows/dissect-ci.yml
new file mode 100644
index 0000000..4602eeb
--- /dev/null
+++ b/.github/workflows/dissect-ci.yml
@@ -0,0 +1,7 @@
+name: Dissect CI
+on: [push, pull_request, workflow_dispatch]
+
+jobs:
+  ci:
+    uses: fox-it/dissect-workflow-templates/.github/workflows/dissect-ci-template-self-hosted.yml@main
+    secrets: inherit
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a89302b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+coverage.xml
+.coverage
+dist/
+.eggs/
+*.egg-info/
+*.pyc
+__pycache__/
+.pytest_cache/
+.tox/
+
+flow/record/version.py
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..c055a21
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,5 @@
+Dissect is released as open source by Fox-IT (https://www.fox-it.com), part of NCC Group Plc (https://www.nccgroup.com)
+
+Developed by the Dissect Team (dissect@fox-it.com) and made available at https://github.com/fox-it/flow.record
+
+License terms: AGPL3 (https://www.gnu.org/licenses/agpl-3.0.html)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..be3f7b2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..4b4dd26
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+exclude .gitignore
+prune .github
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ef882ba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,105 @@
+# flow.record
+
+A library for defining and creating structured data (called records) that can be streamed to disk or piped to other
+tools that use `flow.record`.
+
+Records can be read and transformed to other formats by using output adapters, such as CSV and JSON.
+
+For more information on how Dissect uses this library, please see [the
+documentation](https://dissect.readthedocs.io/en/latest/tools/rdump.html#what-is-a-record).
+
+## Usage
+
+This library contains the tool `rdump`. With `rdump` you can read, write, interact with, and manipulate records from
+`stdin` or from record files saved on disk. Please refer to `rdump -h` or to the [`rdump`
+documentation](https://dissect.readthedocs.io/en/latest/tools/rdump.html) for all parameters.
+
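+`rdump` can also write records through any of the output adapters in `flow/record/adapter/`. A sketch (the
+`adapter://path` URI selects the adapter; `example.records.gz` is a placeholder input file):
+
+```shell
+$ rdump example.records.gz -w jsonfile://output.jsonl
+```
+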
+Records are the primary output type when using the various functions of `target-query`. The following command shows how
+to pipe record output from `target-query` to `rdump`:
+
+```shell
+user@dissect~$ target-query -f runkeys targets/EXAMPLE.vmx | rdump
+
+<...>
+```
+
+## Programming example
+
+Define a `RecordDescriptor` (schema), then create a few records and write them to disk:
+
+```python
+from flow.record import RecordDescriptor, RecordWriter
+
+# define our descriptor
+MyRecord = RecordDescriptor("my/record", [
+    ("net.ipaddress", "ip"),
+    ("string", "description"),
+])
+
+# define some records
+records = [
+    MyRecord("1.1.1.1", "cloudflare dns"),
+    MyRecord("8.8.8.8", "google dns"),
+]
+
+# write the records to disk
+with RecordWriter("output.records.gz") as writer:
+    for record in records:
+        writer.write(record)
+```
+
+The records can then be read from disk using the `rdump` tool or by instantiating a `RecordReader` when using the
+library.
+
+```shell
+$ rdump output.records.gz
+<my/record ip=1.1.1.1 description='cloudflare dns'>
+<my/record ip=8.8.8.8 description='google dns'>
+```
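+
+A minimal sketch of reading the records back with `RecordReader` (using the `output.records.gz` file written above):
+
+```python
+from flow.record import RecordReader
+
+with RecordReader("output.records.gz") as reader:
+    for record in reader:
+        print(record.ip, record.description)
+```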
+
+### Selectors
+
+We can also use `selectors` to filter and select records using a query with a Python-like syntax, e.g.:
+
+```shell
+$ rdump output.records.gz -s '"google" in r.description'
+<my/record ip=8.8.8.8 description='google dns'>
+
+$ rdump output.records.gz -s 'r.ip in net.ipnetwork("1.1.0.0/16")'
+<my/record ip=1.1.1.1 description='cloudflare dns'>
+```
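+
+Selectors can also be applied from Python code. A small sketch using `make_selector`, the same helper the bundled
+adapters use for filtering:
+
+```python
+from flow.record import RecordReader
+from flow.record.selector import make_selector
+
+selector = make_selector('r.ip in net.ipnetwork("1.1.0.0/16")')
+
+with RecordReader("output.records.gz") as reader:
+    for record in reader:
+        if selector.match(record):
+            print(record)
+```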
+
+## Build and test instructions
+
+This project uses `tox` to build source and wheel distributions. Run the following command from the root folder to build
+these:
+
+```bash
+tox -e build
+```
+
+The build artifacts can be found in the `dist/` directory.
+
+`tox` is also used to run linting and unit tests in a self-contained environment. To run both linting and unit tests
+using the default installed Python version, run:
+
+```bash
+tox
+```
+
+For a more elaborate explanation on how to build and test the project, please see [the
+documentation](https://dissect.readthedocs.io/en/latest/contributing/developing.html#building-testing).
+
+## Contributing
+
+The Dissect project encourages any contribution to the codebase. To make your contribution fit into the project, please
+refer to [the style guide](https://dissect.readthedocs.io/en/latest/contributing/style-guide.html).
+
+## Copyright and license
+
+Dissect is released as open source by Fox-IT (<https://www.fox-it.com>), part of NCC Group Plc
+(<https://www.nccgroup.com>).
+
+Developed by the Dissect Team (<dissect@fox-it.com>) and made available at <https://github.com/fox-it/flow.record>.
+
+License terms: AGPL3 (<https://www.gnu.org/licenses/agpl-3.0.html>). For more information, see the LICENSE file.
diff --git a/examples/filesystem.py b/examples/filesystem.py
new file mode 100644
index 0000000..a8f5524
--- /dev/null
+++ b/examples/filesystem.py
@@ -0,0 +1,108 @@
+import os
+import stat
+
+from datetime import datetime
+
+from flow.record import RecordDescriptor, RecordWriter
+
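+# FilesystemFile describes a single directory entry. The multi-line RecordDescriptor
+# definition below uses one line for the record type name, followed by one
+# "<type> <name>;" declaration per field.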
+FilesystemFile = RecordDescriptor("""
+filesystem/unix/entry
+ string path;
+ varint inode;
+ varint dev;
+ unix_file_mode mode;
+ filesize size;
+ uint32 uid;
+ uint32 gid;
+ datetime ctime;
+ datetime mtime;
+ datetime atime;
+ string link;
+""")
+
+
+def hash_file(path, t):
+    # Read the file in 4 KiB chunks; the file is opened in binary mode, so check for an
+    # empty bytes read to detect end-of-file rather than comparing against "".
+    with open(path, "rb") as f:
+        while True:
+            d = f.read(4096)
+            if not d:
+                break
+
+
+class FilesystemIterator:
+ basepath = None
+
+ def __init__(self, basepath):
+ self.basepath = basepath
+ self.recordType = FilesystemFile
+
+ def classify(self, source, classification):
+ self.recordType = FilesystemFile.base(_source=source, _classification=classification)
+
+ def iter(self, path):
+ path = os.path.abspath(path)
+ return self._iter(path)
+
+ def _iter(self, path):
+ if path.startswith("/proc"):
+ return
+
+ st = os.lstat(path)
+
+ abspath = path
+ if self.basepath and abspath.startswith(self.basepath):
+ abspath = abspath[len(self.basepath):]
+
+ ifmt = stat.S_IFMT(st.st_mode)
+
+ link = None
+ if ifmt == stat.S_IFLNK:
+ link = os.readlink(path)
+
+ yield self.recordType(
+ path=abspath,
+ inode=int(st.st_ino),
+ dev=int(st.st_dev),
+ mode=st.st_mode,
+ size=st.st_size,
+ uid=st.st_uid,
+ gid=st.st_gid,
+ ctime=datetime.fromtimestamp(st.st_ctime),
+ mtime=datetime.fromtimestamp(st.st_mtime),
+ atime=datetime.fromtimestamp(st.st_atime),
+ link=link,
+ )
+
+ if ifmt == stat.S_IFDIR:
+ for i in os.listdir(path):
+ if i in (".", ".."):
+ continue
+
+ fullpath = os.path.join(path, i)
+ for e in self.iter(fullpath):
+ yield e
+
+chunk = []
+
+
+if __name__ == "__main__":
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument('target', metavar="TARGET", nargs="*")
+ parser.add_argument('-s', dest='source', help="Source")
+ parser.add_argument('-c', dest='classification', help="Classification")
+ parser.add_argument('-b', dest='base', help="Base directory")
+
+ args = parser.parse_args()
+
+ stream = RecordWriter()
+
+ fsiter = FilesystemIterator(args.base)
+
+ if args.source or args.classification:
+ fsiter.classify(args.source, args.classification)
+
+ for path in args.target:
+ for r in fsiter.iter(path):
+ stream.write(r)
diff --git a/examples/passivedns.py b/examples/passivedns.py
new file mode 100644
index 0000000..be05359
--- /dev/null
+++ b/examples/passivedns.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env pypy
+import record
+import sys
+import datetime
+
+import net.ipv4
+
+from fileprocessing import DirectoryProcessor
+
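+# Legacy example: parse "||"-separated passive DNS log lines from a directory of .log.gz
+# files into records and write them to stdout. It relies on helper modules (record,
+# net.ipv4, fileprocessing) that are not shipped with this package.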
+
+def ts(s):
+ return datetime.datetime.fromtimestamp(float(s))
+
+
+def ip(s):
+ return net.ipv4.Address(s)
+
+
+class SeparatedFile:
+    fp = None
+    separator = None
+    format = None
+
+    def __init__(self, fp, separator, format):
+        self.fp = fp
+        self.separator = separator
+        self.format = format
+
+    def __iter__(self):
+        desc = record.RecordDescriptor([i[0] for i in self.format])
+        recordtype = desc.recordType
+
+        for line in self.fp:
+            p = line.strip().split(self.separator)
+
+            r = {}
+            for i, field in enumerate(self.format):
+                v = p[i]
+                if field[1]:
+                    v = field[1](v)
+
+                r[field[0]] = v
+
+            yield recordtype(**r)
+
+
+def PassiveDnsFile(fp):
+ return SeparatedFile(fp, "||", PASSIVEDNS_FORMAT)
+
+PASSIVEDNS_FORMAT = [
+ ("ts", ts),
+ ("src", ip),
+ ("dst", ip),
+ ("family", None),
+ ("query", None),
+ ("query_type", None),
+ ("result", None),
+ ("ttl", int),
+ ("x", None),
+]
+
+
+def main():
+ rs = record.RecordOutput(sys.stdout)
+ for r in DirectoryProcessor(sys.argv[1], PassiveDnsFile, r"\.log\.gz"):
+ rs.write(r)
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/records.json b/examples/records.json
new file mode 100644
index 0000000..30a415a
--- /dev/null
+++ b/examples/records.json
@@ -0,0 +1,2 @@
+{"_type": "recorddescriptor", "_data": ["text/paste", [["string", "key"], ["datetime", "date"], ["datetime", "expire_date"], ["wstring", "title"], ["wstring", "content"], ["wstring", "user"], ["wstring", "syntax"]]]}
+{"_classification": "PUBLIC", "_generated": "2019-03-19T09:11:04.706581", "_source": "external/pastebin", "_type": "record", "_recorddescriptor": ["text/paste", 831446724], "_version": 1, "content": "This is the content of a sampe pastebin record", "date": "2019-03-19T09:09:47", "expire_date": "1970-01-01T00:00:00", "key": "Q42eWSaF", "syntax": "text", "title": "A sample pastebin record", "user": ""}
diff --git a/examples/tcpconn.py b/examples/tcpconn.py
new file mode 100644
index 0000000..46fa7c4
--- /dev/null
+++ b/examples/tcpconn.py
@@ -0,0 +1,43 @@
+import random
+
+from datetime import datetime
+from flow import record
+
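+# Example: generate 500 random TCP connection records and write them with a RecordWriter
+# (constructed without a path, which writes the records to stdout).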
+conn = record.RecordDescriptor("""
+network/traffic/tcp/connection
+ datetime ts;
+ net.ipv4.Address src;
+ net.tcp.Port srcport;
+ net.ipv4.Address dst;
+ net.tcp.Port dstport;
+""")
+
+ip_list = [
+ "127.0.0.1",
+ "1.2.3.4",
+ "212.33.1.45",
+ "4.4.4.4",
+ "8.8.8.8",
+ "212.1.6.1",
+]
+
+port_list = [
+ 22,
+ 53,
+ 80,
+ 443,
+ 5555
+]
+
+rs = record.RecordWriter()
+
+for i in range(500):
+ r = conn(
+ ts=datetime.now(),
+ src=random.choice(ip_list),
+ srcport=random.choice(port_list),
+ dst=random.choice(ip_list),
+ dstport=random.choice(port_list)
+ )
+
+ rs.write(r)
diff --git a/flow/record/__init__.py b/flow/record/__init__.py
new file mode 100644
index 0000000..1d29015
--- /dev/null
+++ b/flow/record/__init__.py
@@ -0,0 +1,79 @@
+import os
+
+import gzip
+
+from flow.record.base import (
+ RECORD_VERSION,
+ FieldType,
+ Record,
+ GroupedRecord,
+ RecordDescriptor,
+ RecordAdapter,
+ RecordField,
+ RecordReader,
+ RecordWriter,
+ open_path,
+ stream,
+ extend_record,
+ dynamic_fieldtype,
+ DynamicDescriptor,
+ RecordDescriptorError,
+)
+from flow.record.jsonpacker import JsonRecordPacker
+from flow.record.stream import (
+ RecordOutput,
+ RecordPrinter,
+ RecordPacker,
+ RecordStreamWriter,
+ RecordStreamReader,
+ PathTemplateWriter,
+ RecordArchiver,
+ record_stream,
+)
+
+__all__ = [
+ 'RECORD_VERSION', 'FieldType', 'Record', 'GroupedRecord',
+ 'RecordDescriptor', 'RecordAdapter', 'RecordField', 'RecordReader',
+ 'RecordWriter', 'RecordOutput', 'RecordPrinter', 'RecordPacker',
+ 'JsonRecordPacker', 'RecordStreamWriter', 'RecordStreamReader',
+ 'open_path', 'stream', 'dynamic_fieldtype', 'DynamicDescriptor',
+ 'PathTemplateWriter', 'RecordArchiver', 'RecordDescriptorError',
+ 'record_stream', 'extend_record',
+]
+
+
+class View:
+ fields = None
+
+ def __init__(self, fields):
+ self.fields = fields
+
+    def __iter__(self):
+        pass
+
+
+class RecordDateSplitter:
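+    """Write records into one gzipped record file per day, keyed on the record's ts field."""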
+ basepath = None
+ out = None
+
+ def __init__(self, basepath):
+ self.basepath = basepath
+ self.out = {}
+
+    def getstream(self, t):
+        if t not in self.out:
+            # Zero-pad the date parts so filenames look like 2023-03-05.rec.gz and sort correctly
+            path = os.path.join(self.basepath, "-".join(["{:02d}".format(v) for v in t]) + ".rec.gz")
+            f = gzip.GzipFile(path, "wb")
+            rs = RecordStreamWriter(f)
+            self.out[t] = rs
+        return self.out[t]
+
+ def write(self, r):
+ t = (r.ts.year, r.ts.month, r.ts.day)
+ rs = self.getstream(t)
+ rs.write(r)
+ rs.fp.flush()
+
+ def close(self):
+ for rs in self.out.values():
+ rs.close()
diff --git a/flow/record/adapter/__init__.py b/flow/record/adapter/__init__.py
new file mode 100644
index 0000000..f244376
--- /dev/null
+++ b/flow/record/adapter/__init__.py
@@ -0,0 +1,64 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__) # make this namespace extensible from other packages
+import abc
+
+
+def with_metaclass(meta, *bases):
+ """Create a base class with a metaclass. Python 2 and 3 compatible."""
+ # This requires a bit of explanation: the basic idea is to make a dummy
+ # metaclass for one level of class instantiation that replaces itself with
+ # the actual metaclass.
+ class metaclass(type):
+
+ def __new__(cls, name, this_bases, d):
+ return meta(name, bases, d)
+
+ @classmethod
+ def __prepare__(cls, name, this_bases):
+ return meta.__prepare__(name, bases)
+ return type.__new__(metaclass, 'temporary_class', (), {})
+
+
+class AbstractWriter(with_metaclass(abc.ABCMeta, object)):
+
+ @abc.abstractmethod
+ def write(self, rec):
+ """Write a record."""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def flush(self):
+ """Flush any buffered writes."""
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def close(self):
+ """Close the Writer, no more writes will be possible."""
+ raise NotImplementedError
+
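+    # __del__ invokes close(), so implementations should make close() safe to call more
+    # than once (the bundled writers do this by clearing their file handle after closing).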
+ def __del__(self):
+ self.close()
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *args):
+ self.flush()
+ self.close()
+
+
+class AbstractReader(with_metaclass(abc.ABCMeta, object)):
+
+ @abc.abstractmethod
+ def __iter__(self):
+ """Return a record iterator."""
+ raise NotImplementedError
+
+    def close(self):
+        """Close the Reader, can be overridden to properly free resources."""
+        pass
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *args):
+ self.close()
diff --git a/flow/record/adapter/archive.py b/flow/record/adapter/archive.py
new file mode 100644
index 0000000..0086282
--- /dev/null
+++ b/flow/record/adapter/archive.py
@@ -0,0 +1,32 @@
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.stream import RecordArchiver
+
+
+class ArchiveWriter(AbstractWriter):
+ writer = None
+
+ def __init__(self, path, **kwargs):
+ self.path = path
+
+ path_template = kwargs.get("path_template")
+ name = kwargs.get("name")
+
+ self.writer = RecordArchiver(self.path, path_template=path_template, name=name)
+
+ def write(self, r):
+ self.writer.write(r)
+
+ def flush(self):
+ # RecordArchiver already flushes after every write
+ pass
+
+ def close(self):
+ if self.writer:
+ self.writer.close()
+ self.writer = None
+
+
+class ArchiveReader(AbstractReader):
+
+ def __init__(self, path, **kwargs):
+ raise NotImplementedError
diff --git a/flow/record/adapter/avro.py b/flow/record/adapter/avro.py
new file mode 100644
index 0000000..c870d72
--- /dev/null
+++ b/flow/record/adapter/avro.py
@@ -0,0 +1,192 @@
+import json
+from importlib.util import find_spec
+from datetime import datetime, timedelta, timezone
+
+import fastavro
+
+from flow import record
+from flow.record.utils import is_stdout
+from flow.record.selector import make_selector
+from flow.record.adapter import AbstractReader, AbstractWriter
+
+
+AVRO_TYPE_MAP = {
+ "boolean": "boolean",
+ "datetime": "long",
+ "filesize": "long",
+ "uint16": "int",
+ "uint32": "int",
+ "float": "float",
+ "string": "string",
+ "unix_file_mode": "long",
+ "varint": "long",
+ "wstring": "string",
+ "uri": "string",
+ "digest": "bytes",
+ "bytes": "bytes",
+}
+
+RECORD_TYPE_MAP = {
+ "boolean": "boolean",
+ "int": "varint",
+ "long": "varint",
+ "float": "float",
+ "string": "string",
+ "bytes": "bytes",
+}
+
+EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+
+class AvroWriter(AbstractWriter):
+ fp = None
+ writer = None
+
+ def __init__(self, path, key=None, **kwargs):
+ self.fp = record.open_path(path, "wb")
+
+ self.desc = None
+ self.schema = None
+ self.parsed_schema = None
+ self.writer = None
+ self.codec = 'snappy' if find_spec('snappy') else 'deflate'
+
+ def write(self, r):
+ if not self.desc:
+ self.desc = r._desc
+ self.schema = descriptor_to_schema(self.desc)
+ self.parsed_schema = fastavro.parse_schema(self.schema)
+ self.writer = fastavro.write.Writer(self.fp, self.parsed_schema, codec=self.codec)
+
+ if self.desc != r._desc:
+ raise Exception("Mixed record types")
+
+ self.writer.write(r._packdict())
+
+ def flush(self):
+ if self.writer:
+ self.writer.flush()
+
+ def close(self):
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
+ self.writer = None
+
+
+class AvroReader(AbstractReader):
+ fp = None
+
+ def __init__(self, path, selector=None, **kwargs):
+ self.fp = record.open_path(path, "rb")
+ self.selector = make_selector(selector)
+
+ self.reader = fastavro.reader(self.fp)
+ self.schema = self.reader.schema
+ if not self.schema:
+ raise Exception("Missing Avro schema")
+
+ self.desc = schema_to_descriptor(self.schema)
+
+ # Store the fieldnames that are of type "datetime"
+ self.datetime_fields = set(
+ name
+ for name, field in self.desc.get_all_fields().items()
+ if field.typename == "datetime"
+ )
+
+ def __iter__(self):
+ for obj in self.reader:
+ # Convert timestamp-micros fields back to datetime fields
+ for field_name in self.datetime_fields:
+ value = obj.get(field_name, None)
+ if isinstance(value, (int, float)) and value > 0xffffffff:
+ obj[field_name] = EPOCH + timedelta(microseconds=value)
+
+ rec = self.desc.recordType(**obj)
+ if not self.selector or self.selector.match(rec):
+ yield rec
+
+ def close(self):
+ if self.fp:
+ self.fp.close()
+ self.fp = None
+
+
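+# The packed flow.record descriptor is embedded verbatim in the Avro schema "doc" field,
+# so AvroReader can rebuild the exact original descriptor instead of deriving one from
+# the (lossier) Avro field types.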
+def descriptor_to_schema(desc):
+ namespace, _, name = desc.name.rpartition("/")
+ schema = {
+ "type": "record",
+ "namespace": namespace,
+ "name": name,
+ "doc": json.dumps(desc._pack()),
+ "fields": [],
+ }
+
+ fields = []
+ for rf in desc.get_all_fields().values():
+ field_name = rf.name
+ field_type = rf.typename
+ field_schema = {
+ "name": field_name,
+ }
+
+ if field_type == "datetime":
+ field_schema["type"] = [{"type": "long", "logicalType": "timestamp-micros"}, {"type": "null"}]
+ else:
+ avro_type = AVRO_TYPE_MAP.get(field_type)
+ if not avro_type:
+ raise Exception("Unsupported Avro type: {}".format(field_type))
+
+ field_schema["type"] = [avro_type, "null"]
+
+ fields.append(field_schema)
+
+ schema["fields"] = fields
+ return schema
+
+
+def schema_to_descriptor(schema):
+ doc = schema.get("doc")
+
+ # Sketchy record descriptor detection
+ if doc and doc.startswith("[\"") and doc.endswith("]]]"):
+ name, fields = json.loads(doc)
+ else:
+ # No embedded record descriptor, attempt to generate one from the schema
+ name = "/".join([schema.get("namespace", ""), schema.get("name", "")]).replace(".", "/").strip("/")
+ fields = []
+
+ for f in schema.get("fields", []):
+ field_name = f["name"]
+ if field_name.startswith("_"):
+ continue
+
+ field_type = avro_type_to_flow_type(f["type"])
+ fields.append([field_type, field_name])
+
+ return record.RecordDescriptor(name, fields)
+
+
+def avro_type_to_flow_type(ftype):
+ ftypes = [ftype] if not isinstance(ftype, list) else ftype
+
+ # If a field can be null, it has an additional type of "null"
+ # So iterate over all the types, and break when we have a valid one
+ for t in ftypes:
+ if isinstance(t, dict):
+ if t.get("type") == "array":
+ item_type = avro_type_to_flow_type(t.get("items"))
+ return "{}[]".format(item_type)
+            else:
+                # Logical types such as timestamp-micros/-millis, time-* and date map to datetime
+                logical_type = t.get("logicalType")
+                if logical_type and ("time" in logical_type or "date" in logical_type):
+                    return "datetime"
+
+ if t == "null":
+ continue
+
+ if t in RECORD_TYPE_MAP:
+ return RECORD_TYPE_MAP[t]
+
+ raise TypeError("Can't map avro type to flow type: {}".format(t))
diff --git a/flow/record/adapter/broker.py b/flow/record/adapter/broker.py
new file mode 100644
index 0000000..6a2dfaf
--- /dev/null
+++ b/flow/record/adapter/broker.py
@@ -0,0 +1,47 @@
+from flow.record.adapter import AbstractWriter, AbstractReader
+from flow.broker import Publisher, Subscriber
+
+
+class BrokerWriter(AbstractWriter):
+ publisher = None
+
+ def __init__(self, uri, source=None, classification=None, **kwargs):
+ self.publisher = Publisher(uri, **kwargs)
+ self.source = source
+ self.classification = classification
+
+ def write(self, r):
+ record = r._replace(
+ _source=self.source or r._source,
+ _classification=self.classification or r._classification,
+ )
+ self.publisher.send(record)
+
+ def flush(self):
+ if self.publisher:
+ self.publisher.flush()
+
+ def close(self):
+ if self.publisher:
+ if hasattr(self.publisher, "stop"):
+ # Requires flow.broker >= 1.1.1
+ self.publisher.stop()
+ else:
+ self.publisher.wait()
+ self.publisher = None
+
+
+class BrokerReader(AbstractReader):
+ subscriber = None
+
+ def __init__(self, uri, name=None, selector=None, **kwargs):
+ self.subscriber = Subscriber(uri, **kwargs)
+ self.subscription = self.subscriber.select(name, str(selector))
+
+ def __iter__(self):
+ return iter(self.subscription)
+
+ def close(self):
+ if self.subscriber:
+ self.subscriber.stop()
+ self.subscriber = None
diff --git a/flow/record/adapter/csvfile.py b/flow/record/adapter/csvfile.py
new file mode 100644
index 0000000..cbb6622
--- /dev/null
+++ b/flow/record/adapter/csvfile.py
@@ -0,0 +1,43 @@
+from __future__ import absolute_import
+
+import sys
+from csv import DictWriter
+
+from flow.record import open_path
+from flow.record.utils import is_stdout
+from flow.record.adapter import AbstractWriter
+
+
+class CsvfileWriter(AbstractWriter):
+ fp = None
+
+ def __init__(self, path, fields=None, exclude=None, **kwargs):
+ mode = "w"
+ if sys.version_info[0] < 3:
+ mode = "wb"
+ self.fp = open_path(path, mode)
+ self.desc = None
+ self.writer = None
+ self.fields = fields
+ self.exclude = exclude
+ if isinstance(self.fields, str):
+ self.fields = self.fields.split(",")
+ if isinstance(self.exclude, str):
+ self.exclude = self.exclude.split(",")
+
+ def write(self, r):
+ rdict = r._asdict(fields=self.fields, exclude=self.exclude)
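+        # A new CSV header row is written whenever the record descriptor changes mid-stream.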
+ if not self.desc or self.desc != r._desc:
+ self.desc = r._desc
+ self.writer = DictWriter(self.fp, rdict)
+ self.writer.writeheader()
+ self.writer.writerow(rdict)
+
+ def flush(self):
+ if self.fp:
+ self.fp.flush()
+
+ def close(self):
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py
new file mode 100644
index 0000000..38c1b1c
--- /dev/null
+++ b/flow/record/adapter/elastic.py
@@ -0,0 +1,43 @@
+import elasticsearch
+import elasticsearch.helpers
+
+from flow.record.adapter import AbstractWriter, AbstractReader
+
+
+def index_stream(index, it):
+ for r in it:
+ d = r._asdict()
+ if "Value" in d:
+ del d["Value"]
+
+ yield {
+ "_index": index,
+ "_type": "event_" + str(d["EventID"]),
+ "_source": d,
+ }
+
+
+class ElasticWriter(AbstractWriter):
+
+ def __init__(self, index, **kwargs):
+ self.index = index
+
+ self.es = elasticsearch.Elasticsearch()
+
+ # def writeblob(self, src):
+ # count = elasticsearch.helpers.bulk(es, index_stream("logtest", src))
+
+ def write(self, r):
+ self.es.index({"_index": self.index, "_type": r._desc.name, "_source": r.dict()})
+
+ def flush(self):
+ pass
+
+ def close(self):
+ pass
+
+
+class ElasticReader(AbstractReader):
+
+ def __iter__(self):
+ raise NotImplementedError()
diff --git a/flow/record/adapter/jsonfile.py b/flow/record/adapter/jsonfile.py
new file mode 100644
index 0000000..16ab985
--- /dev/null
+++ b/flow/record/adapter/jsonfile.py
@@ -0,0 +1,68 @@
+import json
+from flow import record
+from flow.record import JsonRecordPacker
+from flow.record.utils import is_stdout
+from flow.record.selector import make_selector
+from flow.record.adapter import AbstractWriter, AbstractReader
+from flow.record.fieldtypes import fieldtype_for_value
+
+
+class JsonfileWriter(AbstractWriter):
+ fp = None
+
+ def __init__(self, path, indent=None, **kwargs):
+ self.fp = record.open_path(path, "w")
+ if isinstance(indent, str):
+ indent = int(indent)
+ self.packer = JsonRecordPacker(indent=indent)
+ self.packer.on_descriptor.add_handler(self.packer_on_new_descriptor)
+
+ def packer_on_new_descriptor(self, descriptor):
+ self._write(descriptor)
+
+ def _write(self, obj):
+ record_json = self.packer.pack(obj)
+ self.fp.write(record_json + u"\n")
+
+ def write(self, r):
+ self._write(r)
+
+ def flush(self):
+ if self.fp:
+ self.fp.flush()
+
+ def close(self):
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
+
+
+class JsonfileReader(AbstractReader):
+ fp = None
+
+ def __init__(self, path, selector=None, **kwargs):
+ self.selector = make_selector(selector)
+ self.fp = record.open_path(path, "r")
+ self.packer = JsonRecordPacker()
+
+ def close(self):
+ if self.fp:
+ self.fp.close()
+ self.fp = None
+
+ def __iter__(self):
+ for line in self.fp:
+ obj = self.packer.unpack(line)
+ if isinstance(obj, record.Record):
+ if not self.selector or self.selector.match(obj):
+ yield obj
+ elif isinstance(obj, record.RecordDescriptor):
+ pass
+ else:
+ # fallback for plain jsonlines (non flow.record format)
+ jd = json.loads(line)
+ fields = [(fieldtype_for_value(val, "string"), key) for key, val in jd.items()]
+ desc = record.RecordDescriptor("json/record", fields)
+ obj = desc(**jd)
+ if not self.selector or self.selector.match(obj):
+ yield obj
diff --git a/flow/record/adapter/line.py b/flow/record/adapter/line.py
new file mode 100644
index 0000000..b38f906
--- /dev/null
+++ b/flow/record/adapter/line.py
@@ -0,0 +1,37 @@
+from flow.record.adapter import AbstractWriter
+from flow.record import open_path
+from flow.record.utils import is_stdout
+
+
+class LineWriter(AbstractWriter):
+ """Prints all fields and values of the Record on a separate line."""
+
+ fp = None
+
+ def __init__(self, path, fields=None, exclude=None, **kwargs):
+ self.fp = open_path(path, "wb")
+ self.count = 0
+ self.fields = fields
+ self.exclude = exclude
+ if isinstance(self.fields, str):
+ self.fields = self.fields.split(",")
+ if isinstance(self.exclude, str):
+ self.exclude = self.exclude.split(",")
+
+ def write(self, rec):
+ rdict = rec._asdict(fields=self.fields, exclude=self.exclude)
+ self.count += 1
+ self.fp.write("--[ RECORD {} ]--\n".format(self.count).encode())
+ if rdict:
+ fmt = "{{:>{width}}} = {{}}\n".format(width=max(len(k) for k in rdict))
+ for (key, value) in rdict.items():
+ self.fp.write(fmt.format(key, value).encode())
+
+ def flush(self):
+ if self.fp:
+ self.fp.flush()
+
+ def close(self):
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
diff --git a/flow/record/adapter/mongo.py b/flow/record/adapter/mongo.py
new file mode 100644
index 0000000..69c34c5
--- /dev/null
+++ b/flow/record/adapter/mongo.py
@@ -0,0 +1,91 @@
+import bson
+from flow import record
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.selector import make_selector
+from pymongo import MongoClient
+
+
+def parse_path(path):
+ elements = path.strip("/").split("/", 2) # max 3 elements
+ if len(elements) == 2:
+ return "localhost", elements[0], elements[1]
+ if len(elements) == 3:
+ return tuple(elements)
+ raise ValueError("Invalid mongo path")
+
+
+class MongoWriter(AbstractWriter):
+ client = None
+
+ def __init__(self, path, key=None, **kwargs):
+ dbhost, dbname, collection = parse_path(path)
+
+ self.key = key
+ self.client = MongoClient(host=dbhost)
+ self.db = self.client[dbname]
+ self.collection = self.db[collection]
+ self.coll_descriptors = self.db["_descriptors"]
+ self.descriptors = {}
+
+ def write(self, r):
+ d = r._packdict()
+ d["_type"] = r._desc.identifier
+
+ if r._desc.identifier not in self.descriptors:
+ self.coll_descriptors.find_and_modify(
+ {"name": r._desc.identifier},
+ {"name": r._desc.identifier, "descriptor": r._desc._pack()},
+ upsert=True)
+
+ if self.key:
+ # i = self.collection.replace({self.key: d[self.key]}, d) # PyMongo3
+ self.collection.find_and_modify({self.key: d[self.key]}, d, upsert=True) # PyMongo2
+ else:
+ self.collection.insert(d)
+
+ def flush(self):
+ pass
+
+ def close(self):
+ if self.client:
+ self.client.close()
+ self.client = None
+
+
+class MongoReader(AbstractReader):
+ client = None
+
+ def __init__(self, path, selector=None, **kwargs):
+ dbhost, dbname, collection = parse_path(path)
+
+ self.selector = make_selector(selector)
+ self.client = MongoClient(host=dbhost)
+ self.db = self.client[dbname]
+ self.collection = self.db[collection]
+ self.coll_descriptors = self.db["_descriptors"]
+ self.descriptors = {}
+
+ def close(self):
+ if self.client:
+ self.client.close()
+ self.client = None
+
+ def __iter__(self):
+ desc = None
+ for r in self.collection.find():
+ if r["_type"] not in self.descriptors:
+ packed_desc = self.coll_descriptors.find({"name": r["_type"]})[0]["descriptor"]
+ self.descriptors[r["_type"]] = record.RecordDescriptor(*packed_desc)
+
+ desc = self.descriptors[r["_type"]]
+
+ del r["_id"]
+ del r["_type"]
+
+ for k in list(r.keys()):
+ if isinstance(r[k], bson.int64.Int64):
+ r[k] = int(r[k])
+
+ obj = desc(**r)
+ if not self.selector or self.selector.match(obj):
+ yield obj
diff --git a/flow/record/adapter/splunk.py b/flow/record/adapter/splunk.py
new file mode 100644
index 0000000..8d6c0de
--- /dev/null
+++ b/flow/record/adapter/splunk.py
@@ -0,0 +1,82 @@
+import socket
+import logging
+
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.utils import to_str, to_bytes, to_base64
+
+
+log = logging.getLogger(__package__)
+
+RESERVED_SPLUNK_FIELDS = set([
+ '_indextime',
+ '_time',
+ 'index',
+ 'punct',
+ 'source',
+ 'sourcetype',
+ 'tag',
+])
+
+
+def splunkify(record, tag=None):
+ ret = []
+
+ ret.append(f'type="{record._desc.name}"')
+
+ if tag is None:
+ ret.append('rdtag=None')
+ else:
+ ret.append(f'rdtag="{tag}"')
+
+ for field in record._desc.fields:
+ val = getattr(record, field)
+ if val is None:
+ ret.append(f'{field}=None')
+ else:
+ val = to_base64(val) if isinstance(val, bytes) else to_str(val)
+ val = val.replace('\\', '\\\\').replace('"', '\\"')
+ if field in RESERVED_SPLUNK_FIELDS:
+ field = f'rd_{field}'
+ ret.append(f'{field}="{val}"')
+
+ return " ".join(ret)
+
+
+class SplunkWriter(AbstractWriter):
+ sock = None
+
+ def __init__(self, path, tag=None, **kwargs):
+ p = path.strip("/").split("/")
+ host, port = p[0].split(":")
+ port = int(port)
+
+ self.tag = tag
+ self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.SOL_TCP)
+ self.sock.connect((host, port))
+ self.descriptors = {}
+ self._warned = False
+
+ def write(self, record):
+ if not self._warned and 'rdtag' in record._desc.fields:
+ self._warned = True
+ log.warning(
+ "Record has 'rdtag' field which conflicts with the Splunk adapter -- "
+ "Splunk output will have duplicate 'rdtag' fields",
+ )
+ rec = splunkify(record, tag=self.tag)
+ data = to_bytes(rec) + b"\n"
+ self.sock.sendall(data)
+
+ def flush(self):
+ pass
+
+ def close(self):
+ if self.sock:
+ self.sock.close()
+ self.sock = None
+
+
+class SplunkReader(AbstractReader):
+
+ def __init__(self, path, selector=None, **kwargs):
+ raise NotImplementedError()
diff --git a/flow/record/adapter/stream.py b/flow/record/adapter/stream.py
new file mode 100644
index 0000000..c07ba4b
--- /dev/null
+++ b/flow/record/adapter/stream.py
@@ -0,0 +1,51 @@
+from flow import record
+from flow.record.utils import is_stdout
+from flow.record.adapter import AbstractReader, AbstractWriter
+
+
+class StreamWriter(AbstractWriter):
+ fp = None
+ stream = None
+
+ def __init__(self, path, clobber=True, **kwargs):
+ self.fp = record.open_path(path, "wb", clobber=clobber)
+ self.stream = record.RecordOutput(self.fp)
+
+ def write(self, r):
+ self.stream.write(r)
+
+ def flush(self):
+ if self.stream and hasattr(self.stream, "flush"):
+ self.stream.flush()
+ if self.fp:
+ self.fp.flush()
+
+ def close(self):
+ if self.stream:
+ self.stream.close()
+ self.stream = None
+
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
+
+
+class StreamReader(AbstractReader):
+ fp = None
+ stream = None
+
+ def __init__(self, path, selector=None, **kwargs):
+ self.fp = record.open_path(path, "rb")
+ self.stream = record.RecordStreamReader(self.fp, selector=selector)
+
+ def __iter__(self):
+ return iter(self.stream)
+
+ def close(self):
+ if self.stream:
+ self.stream.close()
+ self.stream = None
+
+ if self.fp:
+ self.fp.close()
+ self.fp = None
diff --git a/flow/record/adapter/text.py b/flow/record/adapter/text.py
new file mode 100644
index 0000000..1e8ce06
--- /dev/null
+++ b/flow/record/adapter/text.py
@@ -0,0 +1,50 @@
+from flow.record import open_path
+from flow.record.utils import is_stdout
+from flow.record.adapter import AbstractWriter
+
+REPLACE_LIST = [
+ (r"\r", "\r"),
+ (r"\n", "\n"),
+ (r"\t", "\t"),
+]
+
+
+class DefaultMissing(dict):
+ def __missing__(self, key):
+ return key.join("{}")
+
+
+class TextWriter(AbstractWriter):
+ """Records are printed as textual representation with repr() or using `format_spec`."""
+
+ fp = None
+
+ def __init__(self, path, flush=True, format_spec=None, **kwargs):
+ self.fp = open_path(path, "wb")
+ self.auto_flush = flush
+ self.format_spec = format_spec
+
+ # Allow some special characters in format template
+ if self.format_spec:
+ for old, new in REPLACE_LIST:
+ self.format_spec = self.format_spec.replace(old, new)
+
+ def write(self, rec):
+ if self.format_spec:
+ buf = self.format_spec.format_map(DefaultMissing(rec._asdict()))
+ else:
+ buf = repr(rec)
+ self.fp.write(buf.encode() + b"\n")
+
+ # because stdout is usually line buffered we force flush here if wanted
+ if self.auto_flush:
+ self.flush()
+
+ def flush(self):
+ if self.fp:
+ self.fp.flush()
+
+ def close(self):
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
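+
+# A rough usage sketch (path and field names are illustrative):
+# writer = TextWriter("out.txt", format_spec=r"{hostname}\t{url}")
+# writer.write(some_record)
+# writes one line per record with the two fields separated by a real tab, while
+# placeholders for unknown fields are kept as literal "{name}" thanks to DefaultMissing.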
diff --git a/flow/record/adapter/xlsx.py b/flow/record/adapter/xlsx.py
new file mode 100644
index 0000000..311af1d
--- /dev/null
+++ b/flow/record/adapter/xlsx.py
@@ -0,0 +1,65 @@
+import openpyxl
+
+from flow import record
+from flow.record.utils import is_stdout
+from flow.record.selector import make_selector
+from flow.record.adapter import AbstractWriter, AbstractReader
+
+
+class XlsxWriter(AbstractWriter):
+ fp = None
+ wb = None
+
+ def __init__(self, path, **kwargs):
+ self.fp = record.open_path(path, "wb")
+ self.wb = openpyxl.Workbook()
+ self.ws = self.wb.active
+ self.desc = None
+ # self.ws.title = "Records"
+
+ def write(self, r):
+ if not self.desc:
+ self.desc = r._desc
+ # openpyxl's append() expects a list/tuple, not a dict view
+ self.ws.append(list(r._desc.fields))
+
+ self.ws.append(list(r._asdict().values()))
+
+ def flush(self):
+ if self.wb:
+ self.wb.save(self.fp)
+
+ def close(self):
+ if self.wb:
+ self.wb.close()
+ self.wb = None
+
+ if self.fp and not is_stdout(self.fp):
+ self.fp.close()
+ self.fp = None
+
+
+class XlsxReader(AbstractReader):
+ fp = None
+
+ def __init__(self, path, selector=None, **kwargs):
+ self.selector = make_selector(selector)
+ self.fp = record.open_path(path, "rb")
+ self.desc = None
+ self.wb = openpyxl.load_workbook(self.fp)
+ self.ws = self.wb.active
+
+ def close(self):
+ if self.fp:
+ self.fp.close()
+ self.fp = None
+
+ def __iter__(self):
+ desc = None
+ for row in self.ws.rows:
+ if not desc:
+ # Build a dynamic descriptor from the header row; the "xlsx/record" name
+ # is a generic placeholder, not dictated by the file
+ desc = record.RecordDescriptor(
+ "xlsx/record",
+ [("dynamic", col.value.replace(" ", "_").lower()) for col in row],
+ )
+ continue
+
+ obj = desc(*[col.value for col in row])
+ if not self.selector or self.selector.match(obj):
+ yield obj
diff --git a/flow/record/base.py b/flow/record/base.py
new file mode 100644
index 0000000..f1730c2
--- /dev/null
+++ b/flow/record/base.py
@@ -0,0 +1,807 @@
+import importlib
+import io
+import re
+import os
+import sys
+import gzip
+import struct
+import logging
+import keyword
+import hashlib
+import functools
+import collections
+try:
+ # Python 2
+ import urlparse
+except ImportError:
+ # Python 3
+ import urllib.parse as urlparse
+try:
+ import lz4.frame as lz4
+ HAS_LZ4 = True
+except ImportError:
+ HAS_LZ4 = False
+try:
+ import bz2
+ HAS_BZ2 = True
+except ImportError:
+ HAS_BZ2 = False
+try:
+ import zstandard as zstd
+ HAS_ZSTD = True
+except ImportError:
+ HAS_ZSTD = False
+
+from collections import OrderedDict
+from operator import itemgetter as _itemgetter
+from .whitelist import WHITELIST, WHITELIST_TREE
+from .utils import to_str, to_native_str
+
+log = logging.getLogger(__package__)
+
+RECORD_VERSION = 1
+RESERVED_FIELDS = OrderedDict([
+ ("_source", "string"),
+ ("_classification", "string"),
+ ("_generated", "datetime"),
+ # For compatibility reasons, always add new reserved fields BEFORE
+ # the _version field, but AFTER the second to last field
+ ("_version", "varint"),
+])
+
+# Compression Headers
+GZIP_MAGIC = b"\x1f\x8b"
+BZ2_MAGIC = b"BZh"
+LZ4_MAGIC = b"\x04\x22\x4d\x18"
+ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"
+
+RE_VALID_FIELD_NAME = re.compile(r"^_?[a-zA-Z][a-zA-Z0-9_]*(?:\[\])?$")
+RE_VALID_RECORD_TYPE_NAME = re.compile("^[a-zA-Z][a-zA-Z0-9_]*(/[a-zA-Z][a-zA-Z0-9_]*)*$")
+
+RECORD_CLASS_TEMPLATE = """
+from datetime import datetime
+from itertools import zip_longest
+
+class {name}(Record):
+ _desc = desc
+ _field_types = {field_types}
+
+ __slots__ = {slots_tuple}
+
+ def __init__(__self, {args}):
+{init_code}
+
+ @classmethod
+ def _unpack(__cls, {args}):
+{unpack_code}
+"""
+
+
+class Peekable:
+ """Wrapper class for adding .peek() to a file object."""
+
+ def __init__(self, fd):
+ self.fd = fd
+ self.buffer = None
+
+ def peek(self, size):
+ if self.buffer is not None:
+ raise BufferError("Only 1 peek allowed")
+ data = self.fd.read(size)
+ self.buffer = io.BytesIO(data)
+ return data
+
+ def read(self, size=None):
+ data = b""
+ if self.buffer is None:
+ data = self.fd.read(size)
+ else:
+ data = self.buffer.read(size)
+ if len(data) < size:
+ data += self.fd.read(size - len(data))
+ self.buffer = None
+ return data
+
+ def close(self):
+ self.buffer = None
+ self.fd.close()
+ self.fd = None
+
+
+class RecordDescriptorError(Exception):
+ pass
+
+
+class FieldType:
+
+ def _typename(self):
+ t = type(self)
+ t.__module__.split(".fieldtypes.")[1] + "." + t.__name__
+
+ @classmethod
+ def default(cls):
+ """Return the default value for the field in the Record template."""
+ return None
+
+ @classmethod
+ def _unpack(cls, data):
+ return data
+
+
+class Record:
+ __slots__ = ()
+
+ def __eq__(self, other):
+ if not isinstance(other, Record):
+ return False
+ return self._pack() == other._pack()
+
+ def _pack(self, unversioned=False):
+ values = []
+ for k in self.__slots__:
+ v = getattr(self, k)
+ v = v._pack() if isinstance(v, FieldType) else v
+
+ # Skip version field if requested (only for compatibility reasons)
+ if unversioned and k == "_version" and v == 1:
+ continue
+ else:
+ values.append(v)
+
+ return self._desc.identifier, tuple(values)
+
+ def _packdict(self):
+ return dict(
+ (k, v._pack() if isinstance(v, FieldType) else v)
+ for k, v in ((k, getattr(self, k)) for k in self.__slots__))
+
+ def _asdict(self, fields=None, exclude=None):
+ exclude = exclude or []
+ if fields:
+ return OrderedDict((k, getattr(self, k)) for k in fields if k in self.__slots__ and k not in exclude)
+ return OrderedDict((k, getattr(self, k)) for k in self.__slots__ if k not in exclude)
+
+ def __setattr__(self, k, v):
+ """Enforce setting the fields to their respective types."""
+ # NOTE: This is a HOT code path
+ field_type = self._field_types.get(k)
+ if v is not None and k in self.__slots__ and field_type:
+ if not isinstance(v, field_type):
+ v = field_type(v)
+ super().__setattr__(k, v)
+
+ def _replace(self, **kwds):
+ result = self.__class__(*map(kwds.pop, self.__slots__, (getattr(self, k) for k in self.__slots__)))
+ if kwds:
+ raise ValueError('Got unexpected field names: {kwds!r}'.format(kwds=list(kwds)))
+ return result
+
+ def __repr__(self):
+ return "<{} {}>".format(
+ self._desc.name,
+ " ".join("{}={!r}".format(k, getattr(self, k)) for k in self._desc.fields))
+
+
+class GroupedRecord(Record):
+ """
+ GroupedRecord acts like a normal Record, but can contain multiple records.
+
+ See it as a flat Record view on top of multiple Records.
+ If two Records have the same fieldname, the first one will prevail.
+ """
+
+ def __init__(self, name, records):
+ super().__init__()
+ self.name = to_str(name)
+ self.records = []
+ self.descriptors = []
+ self.flat_fields = []
+
+ # to avoid recursion in __setattr__ and __getattr__
+ self.__dict__["fieldname_to_record"] = OrderedDict()
+
+ for rec in records:
+ if isinstance(rec, GroupedRecord):
+ for r in rec.records:
+ self.records.append(r)
+ self.descriptors.append(r._desc)
+ else:
+ self.records.append(rec)
+ self.descriptors.append(rec._desc)
+
+ all_fields = rec._desc.get_all_fields()
+ required_fields = rec._desc.get_required_fields()
+ for field in all_fields.values():
+ fname = field.name
+ if fname in self.fieldname_to_record:
+ continue
+ self.fieldname_to_record[fname] = rec
+ if fname not in required_fields:
+ self.flat_fields.append(field)
+ # flat descriptor to maintain compatibility with Record
+
+ self._desc = RecordDescriptor(self.name, [(f.typename, f.name) for f in self.flat_fields])
+
+ def get_record_by_type(self, type_name):
+ """
+ Get record in a GroupedRecord by type_name.
+
+ Args:
+ type_name (str): The record type name (for example wq/meta).
+
+ Returns:
+ None or the record
+
+ """
+ for record in self.records:
+ if record._desc.name == type_name:
+ return record
+ return None
+
+ def _asdict(self, fields=None, exclude=None):
+ exclude = exclude or []
+ keys = self.fieldname_to_record.keys()
+ if fields:
+ return OrderedDict((k, getattr(self, k)) for k in fields if k in keys and k not in exclude)
+ return OrderedDict((k, getattr(self, k)) for k in keys if k not in exclude)
+
+ def __repr__(self):
+ return "<{} {}>".format(self.name, self.records)
+
+ def __setattr__(self, attr, val):
+ if attr in getattr(self, "fieldname_to_record", {}):
+ x = self.fieldname_to_record.get(attr)
+ return setattr(x, attr, val)
+ return object.__setattr__(self, attr, val)
+
+ def __getattr__(self, attr):
+ x = self.__dict__.get("fieldname_to_record", {}).get(attr)
+ if x:
+ return getattr(x, attr)
+ raise AttributeError(attr)
+
+ def _pack(self):
+ return (
+ self.name,
+ tuple(record._pack() for record in self.records),
+ )
+
+ def _replace(self, **kwds):
+ new_records = []
+ for record in self.records:
+ new_records.append(
+ record.__class__(*map(kwds.pop, record.__slots__, (getattr(self, k) for k in record.__slots__)))
+ )
+ if kwds:
+ raise ValueError('Got unexpected field names: {kwds!r}'.format(kwds=list(kwds)))
+ return GroupedRecord(self.name, new_records)
+
+
+def is_valid_field_name(name, check_reserved=True):
+ if check_reserved:
+ if name in RESERVED_FIELDS:
+ return False
+ else:
+ if name in RESERVED_FIELDS:
+ return True
+
+ if name.startswith("_"):
+ return False
+
+ if not RE_VALID_FIELD_NAME.match(name):
+ return False
+
+ return True
+
+
+def parse_def(definition):
+ record_type = None
+ fields = []
+ for line in definition.split("\n"):
+ line = line.strip()
+
+ if not line:
+ continue
+
+ if not record_type:
+ record_type = line
+ else:
+ _type, name = re.split(r"\s+", line.rstrip(";"))
+
+ fields.append((_type, name))
+
+ return record_type, fields
+
+
+class RecordField:
+ name = None
+ typename = None
+ type = None
+
+ def __init__(self, name, typename):
+ if not is_valid_field_name(name, check_reserved=False):
+ raise RecordDescriptorError("Invalid field name: {}".format(name))
+
+ self.name = to_str(name)
+ self.typename = to_str(typename)
+
+ self.type = fieldtype(typename)
+
+ def __repr__(self):
+ return "".format(self.name, self.typename)
+
+
+class RecordFieldSet(list):
+ pass
+
+
+class RecordDescriptor:
+ name = None
+ fields = None
+ recordType = None
+ _desc_hash = None
+
+ def __init__(self, name, fields=None):
+ name = to_str(name)
+
+ if isinstance(fields, RecordDescriptor):
+ # Clone fields
+ fields = fields.get_field_tuples()
+ elif not fields:
+ name, fields = parse_def(name)
+
+ fields = list([(to_native_str(k), to_str(v)) for k, v in fields])
+
+ contains_keyword = False
+ for fieldtype, fieldname in fields:
+ if not is_valid_field_name(fieldname):
+ raise RecordDescriptorError("Field '{}' is an invalid or reserved field name.".format(fieldname))
+
+ # Reserved Python keywords are allowed as field names, but at a cost.
+ # When a Python keyword is used as a field name, you can't use it as a kwarg anymore
+ # You'll be forced to either use *args or expand a dict into kwargs to initialize a record,
+ # E.g. Record('from_value', 'and_value') or Record(**{'from': 1, 'and': 2})
+ # You'll also only be able to get or set reserved attributes using getattr or setattr.
+ # Record initialization will also be slower, due to a different (slower) implementation
+ # that is compatible with this method of initializing records.
+ if keyword.iskeyword(fieldname):
+ contains_keyword = True
+
+ self.fields = OrderedDict([(n, RecordField(n, _type)) for _type, n in fields])
+ all_fields = self.get_all_fields()
+ self.name = name
+
+ if not RE_VALID_RECORD_TYPE_NAME.match(name):
+ raise RecordDescriptorError("Invalid record type name")
+
+ args = ""
+ init_code = ""
+ unpack_code = ""
+
+ if len(all_fields) >= 255 and not (sys.version_info >= (3, 7)) or contains_keyword:
+ args = "*args, **kwargs"
+ init_code = (
+ "\t\tfor k, v in zip_longest(__self.__slots__, args):\n" +
+ "\t\t\tsetattr(__self, k, kwargs.get(k, v))\n" +
+ "\t\t_generated = __self._generated\n")
+ unpack_code = (
+ "\t\tvalues = dict([(f, __cls._field_types[f]._unpack(kwargs.get(f, v)) " +
+ "if kwargs.get(f, v) is not None else None) for f, v in zip_longest(__cls.__slots__, args)])\n" +
+ "\t\treturn __cls(**values)")
+ else:
+ args = ", ".join(["{}=None".format(k) for k in all_fields])
+ unpack_code = "\t\treturn __cls(\n"
+ for field in all_fields.values():
+ if field.type.default == FieldType.default:
+ default = FieldType.default()
+ else:
+ default = "_field_{field.name}.type.default()".format(field=field)
+ init_code += "\t\t__self.{field} = {field} if {field} is not None else {default}\n".format(
+ field=field.name, default=default)
+ unpack_code += (
+ "\t\t\t{field} = _field_{field}.type._unpack({field}) " +
+ "if {field} is not None else {default},\n").format(
+ field=field.name, default=default)
+ unpack_code += "\t\t)"
+
+ init_code += "\t\t__self._generated = _generated or datetime.utcnow()\n\t\t__self._version = RECORD_VERSION"
+ # Store the fieldtypes so we can enforce them in __setattr__()
+ field_types = "{\n"
+ for field in all_fields:
+ field_types += "\t\t{field!r}: _field_{field}.type,\n".format(field=field)
+ field_types += "\t}"
+
+ code = RECORD_CLASS_TEMPLATE.format(
+ name=name.replace("/", "_"),
+ args=args,
+ slots_tuple=tuple(all_fields.keys()),
+ init_code=init_code,
+ unpack_code=unpack_code,
+ field_types=field_types,
+ )
+
+ code = code.replace("\t", " ")
+ c = compile(code, "", "exec")
+
+ data = {
+ "desc": self, "Record": Record, "OrderedDict": OrderedDict,
+ "_itemgetter": _itemgetter, "_property": property,
+ "RECORD_VERSION": RECORD_VERSION,
+ }
+ for field in all_fields.values():
+ data["_field_{}".format(field.name)] = field
+
+ exec(c, data)
+
+ self.recordType = data[name.replace("/", "_")]
+
+ self.identifier = (self.name, self.descriptor_hash)
+
+ @staticmethod
+ def get_required_fields():
+ """
+ Get required fields.
+
+ Returns:
+ OrderedDict
+
+ """
+ required_fields = OrderedDict([(k, RecordField(k, v)) for k, v in RESERVED_FIELDS.items()])
+ return required_fields
+
+ def get_all_fields(self):
+ """
+ Get all fields including required meta fields.
+
+ Returns:
+ OrderedDict
+
+ """
+ required_fields = self.get_required_fields()
+ fields = self.fields.copy()
+ fields.update(required_fields)
+ return fields
+
+ def getfields(self, typename):
+ if isinstance(typename, DynamicFieldtypeModule):
+ name = typename.gettypename()
+ else:
+ name = typename
+
+ return RecordFieldSet(field for field in self.fields.values() if field.typename == name)
+
+ def __call__(self, *args, **kwargs):
+ return self.recordType(*args, **kwargs)
+
+ def init_from_dict(self, rdict, raise_unknown=False):
+ """Create a new Record initialized with key, value pairs from `rdict`.
+
+ If `raise_unknown=True` then fields on `rdict` that are unknown to this
+ RecordDescriptor will raise a TypeError exception due to initializing
+ with unknown keyword arguments. (default: False)
+
+ Returns:
+ Record
+
+ """
+
+ if not raise_unknown:
+ rdict = {k: v for k, v in rdict.items() if k in self.recordType.__slots__}
+ return self.recordType(**rdict)
+
+ def init_from_record(self, record, raise_unknown=False):
+ """Create a new Record initialized with data from another `record`.
+
+ If `raise_unknown=True` then fields on `record` that are unknown to this
+ RecordDescriptor will raise a TypeError exception due to initializing
+ with unknown keyword arguments. (default: False)
+
+ Returns:
+ Record
+
+ """
+ return self.init_from_dict(record._asdict(), raise_unknown=raise_unknown)
+
+ def extend(self, fields):
+ """Returns a new RecordDescriptor with the extended fields
+
+ Returns:
+ RecordDescriptor
+ """
+ new_fields = list(self.get_field_tuples()) + fields
+ return RecordDescriptor(self.name, new_fields)
+
+ def get_field_tuples(self):
+ """Returns a tuple containing the (typename, name) tuples, eg:
+
+ (('boolean', 'foo'), ('string', 'bar'))
+
+ Returns:
+ tuple
+ """
+ return tuple((self.fields[f].typename, self.fields[f].name) for f in self.fields)
+
+ @staticmethod
+ @functools.lru_cache(maxsize=256)
+ def calc_descriptor_hash(name, fields):
+ """Calculate and return the (cached) descriptor hash as a 32 bit integer.
+
+ The descriptor hash is the first 4 bytes of the sha256sum of the descriptor name and field names and types.
+ """
+ h = hashlib.sha256(name.encode("utf-8"))
+ for (typename, name) in fields:
+ h.update(name.encode("utf-8"))
+ h.update(typename.encode("utf-8"))
+ return struct.unpack(">L", h.digest()[:4])[0]
+
+ @property
+ def descriptor_hash(self):
+ """Returns the (cached) descriptor hash"""
+ if not self._desc_hash:
+ self._desc_hash = self.calc_descriptor_hash(self.name, self.get_field_tuples())
+ return self._desc_hash
+
+ def __hash__(self):
+ return hash((self.name, self.get_field_tuples()))
+
+ def __eq__(self, other):
+ if isinstance(other, RecordDescriptor):
+ return self.name == other.name and self.get_field_tuples() == other.get_field_tuples()
+ return NotImplemented
+
+ def __repr__(self):
+ return "".format(self.name, self.descriptor_hash)
+
+ def definition(self, reserved=True):
+ """Return the RecordDescriptor as Python definition string.
+
+ If `reserved` is True it will also return the reserved fields.
+ """
+ fields = []
+ for ftype in self.get_all_fields().values():
+ if not reserved and ftype.name.startswith("_"):
+ continue
+ fields.append(
+ ' ("{ftype.typename}", "{ftype.name}"),'.format(ftype=ftype))
+ fields_str = "\n".join(fields)
+ return 'RecordDescriptor("{}", [\n{}\n])'.format(self.name, fields_str)
+
+ def base(self, **kwargs_sink):
+ def wrapper(**kwargs):
+ kwargs.update(kwargs_sink)
+ return self.recordType(**kwargs)
+
+ return wrapper
+
+ def _pack(self):
+ return self.name, [(field.typename, field.name) for field in self.fields.values()]
+
+ @staticmethod
+ def _unpack(name, fields):
+ return RecordDescriptor(name, fields)
+
+
+def DynamicDescriptor(name, fields):
+ return RecordDescriptor(name, [("dynamic", field) for field in fields])
+
+
+def open_path(path, mode, clobber=True):
+ """
+ Open `path` using `mode` and returns a file object.
+
+ It handles the special cases where `path` refers to stdin or stdout, and also
+ supports compression based on the file extension or the file header of the stream.
+
+ Args:
+ path (str): Filename or path of the file to open
+ mode (str): Either "r"/"rb" to open the file for reading, or "w"/"wb" for writing
+ clobber (bool): If `clobber=True`, overwrite the file if it already exists; otherwise raise IOError.
+
+ """
+ binary = "b" in mode
+ fp = None
+ if mode in ("w", "wb"):
+ out = True
+ elif mode in ("r", "rb"):
+ out = False
+ else:
+ raise ValueError("mode string can only be 'r', 'rb', 'w', or 'wb', not {!r}".format(mode))
+
+ # check for stdin or stdout
+ is_stdio = path in (None, "", "-")
+
+ # check if output path exists
+ if not is_stdio and not clobber and os.path.exists(path) and out:
+ raise IOError("Output file {!r} already exists, and clobber=False".format(path))
+
+ # check path extension for compression
+ if path:
+ if path.endswith(".gz"):
+ fp = gzip.GzipFile(path, mode)
+ elif path.endswith(".bz2"):
+ if not HAS_BZ2:
+ raise RuntimeError('bz2 python module not available')
+ fp = bz2.BZ2File(path, mode)
+ elif path.endswith(".lz4"):
+ if not HAS_LZ4:
+ raise RuntimeError('lz4 python module not available')
+ fp = lz4.open(path, mode)
+ elif path.endswith((".zstd", ".zst")):
+ if not HAS_ZSTD:
+ raise RuntimeError('zstandard python module not available')
+ if not out:
+ dctx = zstd.ZstdDecompressor()
+ fp = dctx.stream_reader(open(path, "rb"))
+ else:
+ cctx = zstd.ZstdCompressor()
+ fp = cctx.stream_writer(open(path, "wb"))
+
+ # normal file or stdio for reading or writing
+ if not fp:
+ if is_stdio:
+ if binary:
+ fp = getattr(sys.stdout, "buffer", sys.stdout) if out else getattr(sys.stdin, "buffer", sys.stdin)
+ else:
+ fp = sys.stdout if out else sys.stdin
+ else:
+ fp = io.open(path, mode)
+ # check if we are reading a compressed stream
+ if not out and binary:
+ if not hasattr(fp, "peek"):
+ fp = Peekable(fp)
+ peek_data = fp.peek(4)
+ if peek_data[:2] == GZIP_MAGIC:
+ fp = gzip.GzipFile(fileobj=fp, mode=mode)
+ elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC:
+ fp = bz2.BZ2File(fp, mode=mode)
+ elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC:
+ fp = lz4.open(fp, mode=mode)
+ elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC:
+ dctx = zstd.ZstdDecompressor()
+ fp = dctx.stream_reader(fp)
+ return fp
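+
+# A rough usage sketch (filenames are illustrative): open_path("out.json.gz", "w")
+# returns a GzipFile selected by extension, open_path("-", "rb") returns stdin's
+# binary buffer, and reading an unsuffixed file in "rb" mode still sniffs the
+# magic bytes above to transparently decompress gzip/bz2/lz4/zstd streams.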
+
+
+def RecordAdapter(url, out, selector=None, clobber=True):
+ url = url or ""
+ url = str(url)
+
+ # Guess adapter based on extension
+ ext_to_adapter = {
+ ".avro": "avro",
+ ".json": "jsonfile",
+ }
+ _, ext = os.path.splitext(url)
+
+ p = urlparse.urlparse(url, ext_to_adapter.get(ext, "stream"))
+
+ if '+' in p.scheme:
+ adapter, sub_adapter = p.scheme.split("+", 1)
+ else:
+ adapter = p.scheme
+ sub_adapter = None
+
+ mod = importlib.import_module("flow.record.adapter.{}".format(adapter))
+
+ clsname = ("{}Writer" if out else "{}Reader").format(adapter.title())
+
+ cls = getattr(mod, clsname)
+ arg_dict = dict(urlparse.parse_qsl(p.query))
+ cls_url = p.netloc + p.path
+ if sub_adapter:
+ cls_url = sub_adapter + "://" + cls_url
+
+ if not out and selector:
+ arg_dict["selector"] = selector
+
+ if out:
+ arg_dict["clobber"] = clobber
+
+ log.debug("Creating {!r} for {!r} with args {!r}".format(cls, url, arg_dict))
+ return cls(cls_url, **arg_dict)
+
+
+def RecordReader(url=None, selector=None):
+ return RecordAdapter(url, False, selector=selector)
+
+
+def RecordWriter(url=None, clobber=True):
+ return RecordAdapter(url, True, clobber=clobber)
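+
+# URL sketch (paths are illustrative): RecordWriter("records.json") picks the
+# jsonfile adapter from the extension, RecordReader("jsonfile://dump.json") selects
+# it explicitly, and an empty URL falls back to the stream adapter on stdin/stdout.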
+
+
+def stream(src, dst):
+ for r in src:
+ dst.write(r)
+ dst.flush()
+
+
+def fieldtype(clspath):
+ if clspath.endswith('[]'):
+ origpath = clspath
+ clspath = clspath[:-2]
+ islist = True
+ else:
+ islist = False
+
+ if clspath not in WHITELIST:
+ raise AttributeError("Invalid field type: {}".format(clspath))
+
+ p = clspath.rsplit(".", 1)
+ module_path = "flow.record.fieldtypes"
+ clsname = p.pop()
+ if p:
+ module_path += "." + p[0]
+
+ mod = importlib.import_module(module_path)
+
+ t = getattr(mod, clsname)
+
+ if not issubclass(t, FieldType):
+ raise AttributeError("Field type does not derive from FieldType")
+
+ if islist:
+ listtype = type(origpath, mod.typedlist.__bases__, dict(mod.typedlist.__dict__))
+ listtype.__type__ = t
+ t = listtype
+
+ return t
+
+
+def extend_record(record, other_records, replace=False, name=None):
+ """Extend `record` with fields and values from `other_records`.
+
+ Duplicate fields are ignored in `other_records` unless `replace=True`.
+
+ Args:
+ record (Record): Initial Record we want to extend.
+ other_records (List[Record]): List of Records we use for extending/replacing.
+ replace (bool): if `True`, it will replace existing fields and values
+ in `record` from fields and values from `other_records`. Last record always wins.
+ name (str): rename the RecordDescriptor name to `name`. Otherwise, use name from
+ initial `record`.
+ """
+ field_map = collections.OrderedDict(
+ (fname, ftype) for (ftype, fname) in record._desc.get_field_tuples()
+ )
+ record_maps = [record._asdict()]
+ for other in other_records:
+ for (ftype, fname) in other._desc.get_field_tuples():
+ if not replace and fname in field_map:
+ continue
+ field_map[fname] = ftype
+ record_maps.append(other._asdict())
+ field_tuples = [(ftype, fname) for (fname, ftype) in field_map.items()]
+ ExtendedRecord = RecordDescriptor(name or record._desc.name, field_tuples)
+ if replace:
+ record_maps = record_maps[::-1]
+ return ExtendedRecord.init_from_dict(collections.ChainMap(*record_maps))
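+
+# A minimal sketch (records are hypothetical): extending a record that only has a
+# "hostname" field with another record carrying "hostname" and "url" yields a new
+# record with both fields; the original "hostname" value is kept unless replace=True.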
+
+
+class DynamicFieldtypeModule:
+
+ def __init__(self, path=""):
+ self.path = path
+
+ def __getattr__(self, path):
+ path = (self.path + "." if self.path else "") + path
+
+ obj = WHITELIST_TREE
+ for p in path.split('.'):
+ if p not in obj:
+ raise AttributeError("Invalid field type: {}".format(path))
+ obj = obj[p]
+
+ return DynamicFieldtypeModule(path)
+
+ def gettypename(self):
+ if fieldtype(self.path):
+ return self.path
+
+ def __call__(self, *args, **kwargs):
+ t = fieldtype(self.path)
+
+ return t(*args, **kwargs)
+
+
+net = DynamicFieldtypeModule("net")
+dynamic_fieldtype = DynamicFieldtypeModule()
diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py
new file mode 100644
index 0000000..0a8fdcc
--- /dev/null
+++ b/flow/record/fieldtypes/__init__.py
@@ -0,0 +1,491 @@
+import re
+import math
+import warnings
+
+import binascii
+from binascii import a2b_hex, b2a_hex
+from posixpath import basename, dirname
+
+from datetime import datetime as _dt, timedelta
+from flow.record.base import FieldType
+
+try:
+ import urlparse
+except ImportError:
+ import urllib.parse as urlparse
+
+RE_NORMALIZE_PATH = re.compile(r'[\\/]+')
+NATIVE_UNICODE = isinstance(u'', str)
+
+string_type = str
+varint_type = int
+bytes_type = bytes
+float_type = float
+
+
+def fieldtype_for_value(value, default="string"):
+ """Returns fieldtype name derived from the value. Returns `default` if it cannot be derived.
+
+ Args:
+ value: value to derive the fieldtype from
+
+ Returns:
+ str: the field type name or `default` if it cannot be derived
+
+ Examples:
+ >>> fieldtype_for_value("hello")
+ "string"
+ >>> fieldtype_for_value(1337)
+ "varint"
+ >>> fieldtype_for_value(object(), None)
+ None
+ """
+ if isinstance(value, bytes_type):
+ return "bytes"
+ elif isinstance(value, string_type):
+ return "string"
+ elif isinstance(value, float_type):
+ return "float"
+ elif isinstance(value, bool):
+ return "boolean"
+ elif isinstance(value, (varint_type, int)):
+ return "varint"
+ elif isinstance(value, _dt):
+ return "datetime"
+ return default
+
+
+class dynamic(FieldType):
+
+ def __new__(cls, obj):
+ if isinstance(obj, FieldType):
+ # Already a flow field type
+ return obj
+
+ elif isinstance(obj, bytes_type):
+ return bytes(obj)
+
+ elif isinstance(obj, string_type):
+ return string(obj)
+
+ elif isinstance(obj, bool):
+ # Must appear before int, because bool is a subclass of int
+ return boolean(obj)
+
+ elif isinstance(obj, (varint_type, int)):
+ return varint(obj)
+
+ elif isinstance(obj, _dt):
+ return datetime(obj)
+
+ elif isinstance(obj, (list, tuple)):
+ return stringlist(obj)
+
+ raise NotImplementedError("Unsupported type for dynamic fieldtype: {}".format(type(obj)))
+
+
+class typedlist(list, FieldType):
+
+ __type__ = None
+
+ def __init__(self, values=None):
+ if not values:
+ values = []
+ super(self.__class__, self).__init__(self._convert(values))
+
+ def _convert(self, values):
+ return [self.__type__(f) if not isinstance(f, self.__type__) else f for f in values]
+
+ def _pack(self):
+ result = []
+ for f in self:
+ if not isinstance(f, self.__type__):
+ # Don't pack records here; that is the job of RecordPacker.
+ # Otherwise unpacking will yield unexpected results (records that are not unpacked).
+ if self.__type__ == record:
+ r = f
+ else:
+ r = self.__type__(f)._pack()
+ result.append(r)
+ else:
+ r = f._pack()
+ result.append(r)
+ return result
+
+ @classmethod
+ def _unpack(cls, data):
+ data = map(cls.__type__._unpack, data)
+ return cls(data)
+
+ @classmethod
+ def default(cls):
+ """Override default so the field is always an empty list."""
+ return cls()
+
+
+class dictlist(list, FieldType):
+
+ def _pack(self):
+ return self
+
+
+class stringlist(list, FieldType):
+
+ def _pack(self):
+ return self
+
+
+class string(string_type, FieldType):
+
+ def __new__(cls, value):
+ if isinstance(value, bytes_type):
+ value = cls._decode(value, "utf-8")
+ if isinstance(value, bytes_type):
+ # Still bytes, so decoding failed (Python 2)
+ return bytes(value)
+ return super().__new__(cls, value)
+
+ def _pack(self):
+ return self
+
+ @classmethod
+ def _decode(cls, data, encoding):
+ """Decode a byte-string into a unicode-string.
+
+ Python 3: When `data` contains invalid unicode characters a `UnicodeDecodeError` is raised.
+ Python 2: When `data` contains invalid unicode characters the original byte-string is returned.
+ """
+ if NATIVE_UNICODE:
+ # Raises exception on decode error
+ return data.decode(encoding)
+ try:
+ return data.decode(encoding)
+ except UnicodeDecodeError:
+ # Fallback to bytes (Python 2 only)
+ preview = data[:16].encode('hex_codec') + ('..' if len(data) > 16 else '')
+ warnings.warn("Got binary data in string field (hex: {}). Compatibility is not guaranteed.".format(
+ preview), RuntimeWarning)
+ return data
+
+
+# Alias for backwards compatibility
+wstring = string
+
+
+class bytes(bytes_type, FieldType):
+ value = None
+
+ def __init__(self, value):
+ if not isinstance(value, bytes_type):
+ raise TypeError("Value not of bytes type")
+ self.value = value
+
+ def _pack(self):
+ return self.value
+
+ def __repr__(self):
+ return repr(self.value)
+
+
+class datetime(_dt, FieldType):
+
+ def __new__(cls, *args, **kwargs):
+ if len(args) == 1 and not kwargs:
+ arg = args[0]
+ if isinstance(arg, bytes_type):
+ arg = arg.decode("utf-8")
+ if isinstance(arg, string_type):
+ # Expects ISO 8601 format, e.g. the output of datetime.isoformat().
+ # When the microseconds part is 0, isoformat() omits it (seconds only),
+ # so we have to account for both forms.
+ # String constructor is used for example in JsonRecordAdapter
+ if "." in arg:
+ return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S.%f")
+ else:
+ return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S")
+ elif isinstance(arg, (int,)):
+ return cls.utcfromtimestamp(arg)
+ elif isinstance(arg, (_dt,)):
+ return _dt.__new__(
+ cls,
+ arg.year, arg.month, arg.day,
+ arg.hour, arg.minute, arg.second, arg.microsecond,
+ arg.tzinfo)
+
+ return _dt.__new__(cls, *args, **kwargs)
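+
+ # Construction sketch (timestamps are illustrative): datetime("2023-01-02T13:37:00")
+ # and datetime(1672666620) should compare equal via the string and epoch paths above,
+ # while datetime("2023-01-02T13:37:00.123456") additionally carries microseconds.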
+
+ def __eq__(self, other):
+ return self - other == timedelta(0)
+
+ def _pack(self):
+ return self
+
+ def __repr__(self):
+ result = str(self)
+ return result
+
+
+class varint(varint_type, FieldType):
+
+ def _pack(self):
+ return self
+
+
+class float(float, FieldType):
+
+ def _pack(self):
+ return self
+
+
+class uint16(int, FieldType):
+
+ value = None
+
+ def __init__(self, value):
+ if value < 0 or value > 0xffff:
+ raise ValueError("Value not within (0x0, 0xffff), got: {}".format(value))
+
+ self.value = value
+
+ def _pack(self):
+ return self.value
+
+ def __repr__(self):
+ return str(self.value)
+
+
+class uint32(int, FieldType):
+ value = None
+
+ def __init__(self, value):
+ if value < 0 or value > 0xffffffff:
+ raise ValueError("Value not within (0x0, 0xffffffff), got {}".format(value))
+
+ self.value = value
+
+ def _pack(self):
+ return self.value
+
+
+class boolean(int, FieldType):
+ value = None
+
+ def __init__(self, value):
+ if value < 0 or value > 1:
+ raise ValueError("Value not a valid boolean value")
+
+ self.value = bool(value)
+
+ def _pack(self):
+ return self.value
+
+ def __str__(self):
+ return str(self.value)
+
+ def __repr__(self):
+ return str(self.value)
+
+
+def human_readable_size(x):
+ # hybrid of http://stackoverflow.com/a/10171475/2595465
+ # with http://stackoverflow.com/a/5414105/2595465
+ if x == 0:
+ return '0'
+ magnitude = int(math.log(abs(x), 10.24))
+ if magnitude > 16:
+ format_str = '%iP'
+ # denominator_mag = 15
+ else:
+ float_fmt = '%2.1f' if magnitude % 3 == 1 else '%1.2f'
+ illion = (magnitude + 1) // 3
+ format_str = float_fmt + " " + [' ', 'K', 'M', 'G', 'T', 'P'][illion]
+ return (format_str % (x * 1.0 / (1024 ** illion))) + "B"
+
+
+class filesize(varint):
+
+ def __repr__(self):
+ return human_readable_size(self)
+
+
+class unix_file_mode(varint):
+
+ def __repr__(self):
+ return oct(self).rstrip("L")
+
+
+class digest(FieldType):
+ __md5 = __md5_bin = None
+ __sha1 = __sha1_bin = None
+ __sha256 = __sha256_bin = None
+
+ def __init__(self, value=None, **kwargs):
+ if isinstance(value, (tuple, list)):
+ self.md5, self.sha1, self.sha256 = value
+ elif isinstance(value, dict):
+ self.md5 = value.get("md5", self.md5)
+ self.sha1 = value.get("sha1", self.sha1)
+ self.sha256 = value.get("sha256", self.sha256)
+
+ @classmethod
+ def default(cls):
+ """Override default so the field is always a digest() instance."""
+ return cls()
+
+ def __repr__(self):
+ return "(md5={d.md5}, sha1={d.sha1}, sha256={d.sha256})".format(d=self)
+
+ @property
+ def md5(self):
+ return self.__md5
+
+ @property
+ def sha1(self):
+ return self.__sha1
+
+ @property
+ def sha256(self):
+ return self.__sha256
+
+ @md5.setter
+ def md5(self, val):
+ if val is None:
+ self.__md5 = self.__md5_bin = None
+ return
+ try:
+ self.__md5_bin = a2b_hex(val)
+ self.__md5 = val
+ if len(self.__md5_bin) != 16:
+ raise TypeError("Incorrect hash length")
+ except binascii.Error as e:
+ raise TypeError("Invalid MD5 value {!r}, {}".format(val, e))
+
+ @sha1.setter
+ def sha1(self, val):
+ if val is None:
+ self.__sha1 = self.__sha1_bin = None
+ return
+ try:
+ self.__sha1_bin = a2b_hex(val)
+ self.__sha1 = val
+ if len(self.__sha1_bin) != 20:
+ raise TypeError("Incorrect hash length")
+ except binascii.Error as e:
+ raise TypeError("Invalid SHA-1 value {!r}, {}".format(val, e))
+
+ @sha256.setter
+ def sha256(self, val):
+ if val is None:
+ self.__sha256 = self.__sha256_bin = None
+ return
+ try:
+ self.__sha256_bin = a2b_hex(val)
+ self.__sha256 = val
+ if len(self.__sha256_bin) != 32:
+ raise TypeError("Incorrect hash length")
+ except binascii.Error as e:
+ raise TypeError("Invalid SHA-256 value {!r}, {}".format(val, e))
+
+ def _pack(self):
+ return (
+ self.__md5_bin,
+ self.__sha1_bin,
+ self.__sha256_bin,
+ )
+
+ @classmethod
+ def _unpack(cls, data):
+ value = (
+ b2a_hex(data[0]).decode() if data[0] else None,
+ b2a_hex(data[1]).decode() if data[1] else None,
+ b2a_hex(data[2]).decode() if data[2] else None,
+ )
+ return cls(value)
+
+
+class uri(string, FieldType):
+
+ def __init__(self, value):
+ self._parsed = urlparse.urlparse(value)
+
+ @staticmethod
+ def normalize(path):
+ r"""Normalize Windows paths to posix.
+
+ c:\windows\system32\cmd.exe -> c:/windows/system32/cmd.exe
+ """
+ return RE_NORMALIZE_PATH.sub('/', path)
+
+ @classmethod
+ def from_windows(cls, path):
+ """Initialize a uri instance from a windows path."""
+ return cls(uri.normalize(path))
+
+ @property
+ def scheme(self):
+ return self._parsed.scheme
+
+ @property
+ def protocol(self):
+ return self.scheme
+
+ @property
+ def netloc(self):
+ return self._parsed.netloc
+
+ @property
+ def path(self):
+ return self._parsed.path
+
+ @property
+ def params(self):
+ return self._parsed.params
+
+ @property
+ def query(self):
+ return self._parsed.query
+
+ @property
+ def args(self):
+ return self.query
+
+ @property
+ def fragment(self):
+ return self._parsed.fragment
+
+ @property
+ def username(self):
+ return self._parsed.username
+
+ @property
+ def password(self):
+ return self._parsed.password
+
+ @property
+ def hostname(self):
+ return self._parsed.hostname
+
+ @property
+ def port(self):
+ return self._parsed.port
+
+ @property
+ def filename(self):
+ return basename(self.path)
+
+ @property
+ def dirname(self):
+ return dirname(self.path)
+
+
+class record(FieldType):
+
+ def __new__(cls, record_value):
+ return record_value
+
+ def _pack(self):
+ return self.value
+
+ @classmethod
+ def _unpack(cls, data):
+ return data
diff --git a/flow/record/fieldtypes/credential.py b/flow/record/fieldtypes/credential.py
new file mode 100644
index 0000000..cc87675
--- /dev/null
+++ b/flow/record/fieldtypes/credential.py
@@ -0,0 +1,9 @@
+from flow.record.fieldtypes import string
+
+
+class username(string):
+ pass
+
+
+class password(string):
+ pass
diff --git a/flow/record/fieldtypes/net/__init__.py b/flow/record/fieldtypes/net/__init__.py
new file mode 100644
index 0000000..10e83e3
--- /dev/null
+++ b/flow/record/fieldtypes/net/__init__.py
@@ -0,0 +1,15 @@
+from flow.record.fieldtypes import string
+from .ip import ipaddress, ipnetwork, IPAddress, IPNetwork
+
+__all__ = [
+ 'ipaddress', 'ipnetwork',
+ 'IPAddress', 'IPNetwork',
+]
+
+
+class hostname(string):
+ pass
+
+
+class email(string):
+ pass
diff --git a/flow/record/fieldtypes/net/ip.py b/flow/record/fieldtypes/net/ip.py
new file mode 100644
index 0000000..b11c680
--- /dev/null
+++ b/flow/record/fieldtypes/net/ip.py
@@ -0,0 +1,80 @@
+from ipaddress import ip_address, ip_network
+from flow.record.base import FieldType
+
+
+class ipaddress(FieldType):
+ val = None
+ _type = "net.ipaddress"
+
+ def __init__(self, addr):
+ self.val = ip_address(addr)
+
+ def __eq__(self, b):
+ try:
+ return self.val == ip_address(b)
+ except ValueError:
+ return False
+
+ def __str__(self):
+ return str(self.val)
+
+ def __repr__(self):
+ return "{}({!r})".format(self._type, str(self))
+
+ def _pack(self):
+ return self.val.packed
+
+ @staticmethod
+ def _unpack(data):
+ return ipaddress(data)
+
+
+class ipnetwork(FieldType):
+ val = None
+ _type = "net.ipnetwork"
+
+ def __init__(self, addr):
+ self.val = ip_network(addr)
+
+ def __eq__(self, b):
+ try:
+ return self.val == ip_network(b)
+ except ValueError:
+ return False
+
+ @staticmethod
+ def _is_subnet_of(a, b):
+ try:
+ # Always false if one is v4 and the other is v6.
+ if a._version != b._version:
+ raise TypeError("{} and {} are not of the same version".format(a, b))
+ return (b.network_address <= a.network_address and
+ b.broadcast_address >= a.broadcast_address)
+ except AttributeError:
+ raise TypeError("Unable to test subnet containment "
+ "between {} and {}".format(a, b))
+
+ def __contains__(self, b):
+ try:
+ return self._is_subnet_of(ip_network(b), self.val)
+ except (ValueError, TypeError):
+ return False
+
+ def __str__(self):
+ return str(self.val)
+
+ def __repr__(self):
+ return "{}({!r})".format(self._type, str(self))
+
+ def _pack(self):
+ return self.val.compressed
+
+ @staticmethod
+ def _unpack(data):
+ return ipnetwork(data)
+
+
+# alias: net.IPAddress -> net.ipaddress
+# alias: net.IPNetwork -> net.ipnetwork
+IPAddress = ipaddress
+IPNetwork = ipnetwork
diff --git a/flow/record/fieldtypes/net/ipv4.py b/flow/record/fieldtypes/net/ipv4.py
new file mode 100644
index 0000000..e271b74
--- /dev/null
+++ b/flow/record/fieldtypes/net/ipv4.py
@@ -0,0 +1,137 @@
+import struct
+import socket
+
+from flow.record import FieldType
+from flow.record.utils import to_native_str
+
+
+def addr_long(s):
+ if isinstance(s, Address):
+ return s.val
+
+ if isinstance(s, int):
+ return s
+
+ return struct.unpack(">I", socket.inet_aton(s))[0]
+
+
+def addr_str(s):
+ if isinstance(s, Address):
+ return socket.inet_ntoa(struct.pack(">I", s.val))
+
+ if isinstance(s, int):
+ return socket.inet_ntoa(struct.pack(">I", s))
+
+ return s
+
+
+def mask_to_bits(n):
+ return bin(n).count("1")
+
+
+def bits_to_mask(b):
+ return (0xffffffff << (32 - b)) & 0xffffffff
+
+
+class subnet(FieldType):
+ net = None
+ mask = None
+ _type = "net.ipv4.subnet"
+
+ def __init__(self, addr, netmask=None):
+ if isinstance(addr, type(u'')):
+ addr = to_native_str(addr)
+
+ if not isinstance(addr, str):
+ raise TypeError("Subnet() argument 1 must be string, not {}".format(type(addr).__name__))
+
+ if netmask is None:
+ ip, sep, mask = addr.partition("/")
+ self.mask = bits_to_mask(int(mask)) if mask else 0xffffffff
+ self.net = addr_long(ip)
+ else:
+ self.net = addr_long(addr)
+ self.mask = bits_to_mask(netmask)
+
+ if self.net & self.mask != self.net:
+ suggest = '{}/{}'.format(addr_str(self.net & self.mask), mask_to_bits(self.mask))
+ raise ValueError("Not a valid subnet {!r}, did you mean {!r} ?".format(str(addr), suggest))
+
+ def __contains__(self, addr):
+ if addr is None:
+ return False
+
+ if isinstance(addr, type(u'')):
+ addr = to_native_str(addr)
+
+ if isinstance(addr, str):
+ addr = addr_long(addr)
+
+ if isinstance(addr, Address):
+ addr = addr.val
+
+ if isinstance(addr, int):
+ return addr & self.mask == self.net
+
+ return False
+
+ def __str__(self):
+ return "{0}/{1}".format(addr_str(self.net), mask_to_bits(self.mask))
+
+ def __repr__(self):
+ return "{}({!r})".format(self._type, str(self))
+
+
+class SubnetList:
+ subnets = None
+
+ def __init__(self):
+ self.subnets = []
+
+ def load(self, path):
+ # Text mode, so line.split(" ") operates on str under Python 3
+ f = open(path, "r")
+ for line in f:
+ entry, desc = line.split(" ", 1)
+ self.subnets.append(Subnet(entry))
+
+ f.close()
+
+ def add(self, subnet):
+ self.subnets.append(Subnet(subnet))
+
+ def __contains__(self, addr):
+ if type(addr) is str:
+ addr = addr_long(addr)
+
+ return any(addr in s for s in self.subnets)
+
+
+class address(FieldType):
+ val = None
+ _type = "net.ipv4.address"
+
+ def __init__(self, addr):
+ self.val = addr_long(addr)
+
+ def __eq__(self, b):
+ return addr_long(self) == addr_long(b)
+
+ def __str__(self):
+ return addr_str(self.val)
+
+ def __repr__(self):
+ return "{}({!r})".format(self._type, str(self))
+
+ def _pack(self):
+ return self.val
+
+ @staticmethod
+ def _unpack(data):
+ return address(data)
+
+
+# Backwards compatibility
+Address = address
+Subnet = subnet
+
+__all__ = ["address", "subnet", "Address", "Subnet", "SubnetList"]
diff --git a/flow/record/fieldtypes/net/tcp.py b/flow/record/fieldtypes/net/tcp.py
new file mode 100644
index 0000000..aa4f4d9
--- /dev/null
+++ b/flow/record/fieldtypes/net/tcp.py
@@ -0,0 +1,9 @@
+from flow.record.fieldtypes import uint16
+
+
+class port(uint16):
+ pass
+
+
+# Backwards compatibility
+Port = port
diff --git a/flow/record/fieldtypes/net/udp.py b/flow/record/fieldtypes/net/udp.py
new file mode 100644
index 0000000..aa4f4d9
--- /dev/null
+++ b/flow/record/fieldtypes/net/udp.py
@@ -0,0 +1,9 @@
+from flow.record.fieldtypes import uint16
+
+
+class port(uint16):
+ pass
+
+
+# Backwards compatibility
+Port = port
diff --git a/flow/record/jsonpacker.py b/flow/record/jsonpacker.py
new file mode 100644
index 0000000..ca4ae35
--- /dev/null
+++ b/flow/record/jsonpacker.py
@@ -0,0 +1,101 @@
+import json
+import base64
+import logging
+from datetime import datetime
+
+from . import fieldtypes
+from .base import Record, RecordDescriptor
+from .utils import EventHandler
+
+log = logging.getLogger(__package__)
+
+
+class JsonRecordPacker:
+
+ def __init__(self, indent=None):
+ self.descriptors = {}
+ self.on_descriptor = EventHandler()
+ self.indent = indent
+
+ def register(self, desc, notify=False):
+ if not isinstance(desc, RecordDescriptor):
+ raise Exception("Expected Record Descriptor")
+
+ # Descriptor already known
+ if desc.identifier in self.descriptors:
+ return
+
+ # versioned record descriptor
+ self.descriptors[desc.identifier] = desc
+
+ # for older non versioned records
+ self.descriptors[desc.name] = desc
+
+ if notify and self.on_descriptor:
+ log.debug("JsonRecordPacker::on_descriptor {}".format(desc))
+ self.on_descriptor(desc)
+
+ def pack_obj(self, obj):
+ if isinstance(obj, Record):
+ if obj._desc.identifier not in self.descriptors:
+ self.register(obj._desc, True)
+ serial = obj._asdict()
+ serial['_type'] = 'record'
+ serial['_recorddescriptor'] = obj._desc.identifier
+
+ # PYTHON2: Because "bytes" are also "str" we have to handle this here
+ for (field_type, field_name) in obj._desc.get_field_tuples():
+ if field_type == "bytes" and isinstance(serial[field_name], str):
+ serial[field_name] = base64.b64encode(serial[field_name]).decode()
+
+ return serial
+ if isinstance(obj, RecordDescriptor):
+ serial = {
+ '_type': 'recorddescriptor',
+ '_data': obj._pack(),
+ }
+ return serial
+ if isinstance(obj, datetime):
+ serial = obj.strftime("%Y-%m-%dT%H:%M:%S.%f")
+ return serial
+ if isinstance(obj, fieldtypes.digest):
+ return {
+ "md5": obj.md5,
+ "sha1": obj.sha1,
+ "sha256": obj.sha256,
+ }
+ if isinstance(obj, (fieldtypes.net.ipaddress, fieldtypes.net.ipnetwork)):
+ return str(obj)
+ if isinstance(obj, bytes):
+ return base64.b64encode(obj).decode()
+
+ raise Exception("Unpackable type " + str(type(obj)))
+
+ def unpack_obj(self, obj):
+ if isinstance(obj, dict):
+ _type = obj.get('_type', None)
+ if _type == "record":
+ record_descriptor_identifier = obj['_recorddescriptor']
+ record_descriptor_identifier = tuple(record_descriptor_identifier)
+ record_descriptor = self.descriptors[record_descriptor_identifier]
+ del obj['_recorddescriptor']
+ del obj['_type']
+ for (field_type, field_name) in record_descriptor.get_field_tuples():
+ if field_type == "bytes":
+ obj[field_name] = base64.b64decode(obj[field_name])
+ result = record_descriptor.recordType(**obj)
+ return result
+ if _type == "recorddescriptor":
+ data = obj['_data']
+ return RecordDescriptor._unpack(*data)
+ return obj
+
+ def pack(self, obj):
+ return json.dumps(obj, default=self.pack_obj, indent=self.indent)
+
+ def unpack(self, d):
+ record_dict = json.loads(d, object_hook=self.unpack_obj)
+ result = self.unpack_obj(record_dict)
+ if isinstance(result, RecordDescriptor):
+ self.register(result)
+ return result
diff --git a/flow/record/packer.py b/flow/record/packer.py
new file mode 100644
index 0000000..efcbf9b
--- /dev/null
+++ b/flow/record/packer.py
@@ -0,0 +1,167 @@
+import warnings
+import binascii
+import datetime
+import msgpack
+import functools
+
+from . import fieldtypes
+from .base import Record, FieldType, RecordDescriptor, GroupedRecord, RESERVED_FIELDS, RECORD_VERSION
+from .utils import EventHandler, to_str
+
+# Override defaults for msgpack packb/unpackb
+packb = functools.partial(msgpack.packb, use_bin_type=True)
+unpackb = functools.partial(msgpack.unpackb, raw=False)
+
+RECORD_PACK_EXT_TYPE = 0xe
+
+RECORD_PACK_TYPE_RECORD = 0x1
+RECORD_PACK_TYPE_DESCRIPTOR = 0x2
+RECORD_PACK_TYPE_FIELDTYPE = 0x3
+RECORD_PACK_TYPE_DATETIME = 0x10
+RECORD_PACK_TYPE_VARINT = 0x11
+RECORD_PACK_TYPE_GROUPEDRECORD = 0x12
+
+
+def identifier_to_str(identifier):
+ if isinstance(identifier, tuple) and len(identifier) == 2:
+ return (to_str(identifier[0]), identifier[1])
+ else:
+ return to_str(identifier)
+
+
+class RecordPacker:
+ EXT_TYPE = RECORD_PACK_EXT_TYPE
+ TYPES = [FieldType, Record, RecordDescriptor]
+
+ def __init__(self):
+ self.descriptors = {}
+ self.on_descriptor = EventHandler()
+
+ def register(self, desc, notify=False):
+ if not isinstance(desc, RecordDescriptor):
+ raise Exception("Expected Record Descriptor")
+
+ # versioned record descriptor
+ self.descriptors[desc.identifier] = desc
+
+ # for older non versioned records
+ self.descriptors[desc.name] = desc
+
+ if notify and self.on_descriptor:
+ self.on_descriptor(desc)
+
+ def pack_obj(self, obj, unversioned=False):
+ packed = None
+
+ if isinstance(obj, datetime.datetime):
+ t = obj.utctimetuple()[:6] + (obj.microsecond, )
+ packed = (RECORD_PACK_TYPE_DATETIME, t)
+
+ elif isinstance(obj, int):
+ neg = obj < 0
+ h = hex(abs(obj))[2:].rstrip("L")
+ if len(h) % 2 != 0:
+ h = "0" + h
+
+ packed = RECORD_PACK_TYPE_VARINT, (neg, binascii.a2b_hex(h))
+
+ elif isinstance(obj, GroupedRecord):
+ for desc in obj.descriptors:
+ if desc.identifier not in self.descriptors:
+ self.register(desc, True)
+
+ packed = RECORD_PACK_TYPE_GROUPEDRECORD, obj._pack()
+
+ elif isinstance(obj, Record):
+ if obj._desc.identifier not in self.descriptors:
+ self.register(obj._desc, True)
+
+ data = obj._pack(unversioned=unversioned)
+ packed = RECORD_PACK_TYPE_RECORD, data
+
+ elif isinstance(obj, RecordDescriptor):
+ packed = RECORD_PACK_TYPE_DESCRIPTOR, obj._pack()
+
+ if not packed:
+ raise Exception("Unpackable type " + str(type(obj)))
+
+ return msgpack.ExtType(RECORD_PACK_EXT_TYPE, self.pack(packed))
+
+ def pack(self, obj):
+ return packb(obj, default=self.pack_obj)
+
+ def unpack_obj(self, t, data):
+ if t != RECORD_PACK_EXT_TYPE:
+ raise Exception("Unknown ExtType")
+
+ subtype, value = self.unpack(data)
+
+ if subtype == RECORD_PACK_TYPE_DATETIME:
+ dt = fieldtypes.datetime(*value)
+ return dt
+
+ if subtype == RECORD_PACK_TYPE_VARINT:
+ neg, h = value
+ v = int(binascii.b2a_hex(h), 16)
+ if neg:
+ v = -v
+
+ return v
+
+ if subtype == RECORD_PACK_TYPE_RECORD:
+ identifier, values = value
+ identifier = identifier_to_str(identifier)
+ desc = self.descriptors[identifier]
+
+ # Compatibility for older records
+ # We check the actual number of values against the expected number of values.
+ # The values received include reserved fields, so we have to add them to the
+ # fields already declared in the descriptor.
+ # The descriptor should be received from the same stream, so any inconsistency
+ # in field count should be from reserved fields.
+ version = values[-1]
+ expected_len = len(desc.fields) + len(RESERVED_FIELDS)
+
+ # Perform some basic checking on record version, if any, and issue a warning if needed.
+ if not isinstance(version, int) or version < 1 or version > 255:
+ warnings.warn(
+ ("Got old style record with no version information (expected {:d}). " +
+ "Compatibility is not guaranteed.").format(
+ RECORD_VERSION), RuntimeWarning)
+ elif version != RECORD_VERSION:
+ warnings.warn(
+ "Got other version record (expected {:d}, got {:d}). Compatibility is not guaranteed.".format(
+ RECORD_VERSION, version), RuntimeWarning)
+ # Optionally add compatibility code here later
+
+ # If the actual number of fields is smaller, there's nothing we can really do.
+ # If the actual number of fields is larger, we strip the additional fields but
+ # keep the version field.
+ # This implies that any record that has _more_ reserved fields always
+ # has a version field.
+ if len(values) > expected_len:
+ # Likely newer style record. Strip extra fields but maintain version field
+ values = values[:expected_len - 1]
+ values += (version,)
+
+ return desc.recordType._unpack(*values)
+
+ if subtype == RECORD_PACK_TYPE_GROUPEDRECORD:
+ name, packed_records = value
+ records = []
+ for value in packed_records:
+ identifier, values = value
+ identifier = identifier_to_str(identifier)
+ desc = self.descriptors[identifier]
+ records.append(desc.recordType._unpack(*values))
+ return GroupedRecord(name, records)
+
+ if subtype == RECORD_PACK_TYPE_DESCRIPTOR:
+ name, fields = value
+ name = to_str(name)
+ return RecordDescriptor._unpack(name, fields)
+
+ raise Exception("Unknown subtype: %x" % subtype)
+
+ def unpack(self, d):
+ return unpackb(d, ext_hook=self.unpack_obj, use_list=False)
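+
+
+# A minimal usage sketch (illustrative only), mirroring the pack/unpack tests:
+#
+#   from flow.record import RecordDescriptor
+#
+#   TestRecord = RecordDescriptor("test/ipaddress", [("net.ipaddress", "ip")])
+#   packer = RecordPacker()
+#   data = packer.pack(TestRecord("10.22.99.255"))   # msgpack bytes using ExtType 0xe
+#   record = packer.unpack(data)
+#   assert record.ip == "10.22.99.255"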
diff --git a/flow/record/selector.py b/flow/record/selector.py
new file mode 100644
index 0000000..ba00308
--- /dev/null
+++ b/flow/record/selector.py
@@ -0,0 +1,714 @@
+import __future__
+
+import ast
+import operator
+import re
+
+from flow.record.base import GroupedRecord, Record, dynamic_fieldtype
+from flow.record.fieldtypes import net
+from flow.record.whitelist import WHITELIST, WHITELIST_TREE
+
+try:
+ import astor
+ HAVE_ASTOR = True
+except ImportError:
+ HAVE_ASTOR = False
+
+string_types = (str, type(u''))
+
+AST_NODE_S_TYPES = tuple(
+ filter(None, [
+ getattr(ast, "Str", None),
+ getattr(ast, "Bytes", None),
+ ]),
+)
+
+AST_NODE_VALUE_TYPES = tuple(
+ filter(None, [
+ getattr(ast, "NameConstant", None),
+ getattr(ast, "Constant", None),
+ ]),
+)
+
+AST_OPERATORS = {
+ ast.Add: operator.add,
+ ast.Mult: operator.mul,
+ ast.Div: operator.truediv,
+ ast.And: operator.and_,
+ ast.Or: operator.or_,
+ ast.Not: operator.not_,
+ ast.Mod: operator.mod,
+ ast.BitAnd: operator.and_,
+ ast.BitOr: operator.or_,
+}
+
+AST_COMPARATORS = {
+ ast.Eq: operator.eq,
+ ast.In: lambda left, right:
+ False if (isinstance(left, NoneObject) or isinstance(right, NoneObject))
+ else operator.contains(right, left),
+ ast.NotIn: lambda left, right:
+ False if (isinstance(left, NoneObject) or isinstance(right, NoneObject))
+ else operator.contains(right, left) is False,
+ ast.NotEq: operator.ne,
+ ast.Gt: operator.gt,
+ ast.Lt: operator.lt,
+ ast.GtE: operator.ge,
+ ast.LtE: operator.le,
+ ast.Is: operator.is_,
+ ast.IsNot: operator.is_not,
+}
+
+
+class NoneObject:
+ """Returned in the Selector matching if a field does not exist on the Record.
+
+ NoneObject is used to override some comparators like __contains__.
+ """
+
+ def __eq__(a, b):
+ return False
+
+ def __ne__(a, b):
+ return False
+
+ def __lt__(a, b):
+ return False
+
+ def __gt__(a, b):
+ return False
+
+ def __le__(a, b):
+ return False
+
+ def __ge__(a, b):
+ return False
+
+ def __noteq__(a, b):
+ return False
+
+ def __contains__(a, b):
+ return False
+
+ def __len__(self):
+ return 0
+
+
+NONE_OBJECT = NoneObject()
+
+
+class InvalidSelectorError(Exception):
+ pass
+
+
+class InvalidOperation(Exception):
+ pass
+
+
+def lower(s):
+ """Return lowercased string, otherwise `s` if not string type."""
+ if isinstance(s, string_types):
+ return s.lower()
+ return s
+
+
+def upper(s):
+ """Return uppercased string, otherwise `s` if not string type."""
+ if isinstance(s, string_types):
+ return s.upper()
+ return s
+
+
+def names(r):
+ """Return the available names as a set in the Record otherwise ['UnknownRecord']."""
+ if isinstance(r, GroupedRecord):
+ return set(sub_record._desc.name for sub_record in r.records)
+ if isinstance(r, (Record, WrappedRecord)):
+ return set([r._desc.name])
+ return ["UnknownRecord"]
+
+
+def name(r):
+ """Return the name of the Record otherwise 'UnknownRecord'."""
+ if isinstance(r, (Record, WrappedRecord)):
+ return r._desc.name
+ return "UnknownRecord"
+
+
+def get_type(obj):
+ """Return the type of the Object as 'str'."""
+ return str(type(obj))
+
+
+def has_field(r, field):
+ """Check if field exists on Record object.
+
+ Args:
+ r: Record to match on.
+ field: The field name to check.
+
+ Returns:
+ (bool): True if field exists, otherwise False
+
+ """
+ return field in r._desc.fields
+
+
+def field_regex(r, fields, regex):
+ """Check a regex against fields of a Record object.
+
+ Args:
+ r: The record to match on.
+ fields: The fields in the Record to match.
+ regex: The regex pattern to search for.
+
+ Returns:
+ (bool): True or False
+
+ """
+ s_pattern = re.compile(regex)
+ for field in fields:
+ fvalue = getattr(r, field, NONE_OBJECT)
+ if fvalue is NONE_OBJECT:
+ continue
+
+ match = re.search(s_pattern, fvalue)
+ if match is not None:
+ return True
+ return False
+
+
+def field_equals(r, fields, strings, nocase=True):
+ """Check for exact string matches on fields of a Record object.
+
+ Args:
+ r: The record to match on.
+ fields: The fields in the Record to match.
+ strings: The strings to search for.
+ nocase: Should the matching be case insensitive.
+
+ Returns:
+ (bool): True or False
+
+ """
+ if nocase:
+ strings_to_check = [lower(s) for s in strings]
+ else:
+ strings_to_check = strings
+
+ for field in fields:
+ fvalue = getattr(r, field, NONE_OBJECT)
+ if fvalue is NONE_OBJECT:
+ continue
+ if nocase:
+ fvalue = lower(fvalue)
+ for s in strings_to_check:
+ if s == fvalue:
+ return True
+ return False
+
+
+def field_contains(r, fields, strings, nocase=True, word_boundary=False):
+ """Check if the string matches on fields of a Record object.
+
+ Only supports strings for now and partial matches using the __contains__ operator.
+
+ * `fields` is a list of field names to check
+ * `strings` is a list of strings to check on the fields
+ * `word_boundary` is a boolean. True if matches should only be counted on word boundaries.
+ * Non existing fields on the Record object are skipped.
+ * Defaults to case-insensitive matching, use `nocase=False` if you want to be case sensitive.
+ """
+ if nocase:
+ strings_to_check = [lower(s) for s in strings]
+ else:
+ strings_to_check = strings
+
+ for field in fields:
+ fvalue = getattr(r, field, NONE_OBJECT)
+ if fvalue is NONE_OBJECT:
+ continue
+ if nocase:
+ fvalue = lower(fvalue)
+ for s in strings_to_check:
+ if word_boundary is False:
+ if s in fvalue:
+ return True
+ else:
+ if fvalue is None:
+ if s is None:
+ return True
+ continue
+
+ if not isinstance(fvalue, string_types):
+ continue
+
+ s_pattern = u"\\b{}\\b".format(re.escape(s))
+ match = re.search(s_pattern, fvalue)
+ if match is not None:
+ return True
+ return False
+
+
+# Function whitelist that are allowed in selectors
+FUNCTION_WHITELIST = [
+ lower, upper, name, names, get_type, field_contains, field_equals, field_regex, has_field,
+]
+
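+# A short sketch (illustrative only) of using these whitelisted helpers from a
+# Selector expression (Selector is defined later in this module); the field values
+# are placeholders:
+#
+#   from flow.record import RecordDescriptor
+#
+#   TestRecord = RecordDescriptor("test/record", [("string", "query"), ("string", "url")])
+#   record = TestRecord("foo", "http://example.com")
+#   assert record in Selector("field_contains(r, ['query', 'url'], ['FOO'])")   # case-insensitive by default
+#   assert record not in Selector("field_equals(r, ['query'], ['bar'])")
+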
+
+def resolve_attr_path(node):
+ """Resolve a node attribute to full path, eg: net.ipv4.Subnet."""
+ x = node.func
+ attr_path = []
+ while isinstance(x, ast.Attribute):
+ attr_path.append(x.attr)
+ x = x.value
+ if isinstance(x, ast.Name):
+ attr_path.append(x.id)
+ return '.'.join(reversed(attr_path))
+
+
+class SelectorResult:
+
+ def __init__(self, expression_str, match_result, backtrace, referenced_fields):
+ self.expression_str = expression_str
+ self.result = match_result
+ self.backtrace_info = backtrace
+ self.referenced_fields = referenced_fields
+
+ def backtrace(self):
+ result = u""
+ max_source_line_length = len(self.expression_str)
+ for row in self.backtrace_info[::-1]:
+ result += u"{}-> {}\n".format(
+ row[0].rstrip().ljust(max_source_line_length + 15),
+ row[1])
+ return result
+
+
+class Selector:
+ VERBOSITY_ALL = 1
+ VERBOSITY_BRANCHES = 2
+ VERBOSITY_NONE = 3
+
+ def __init__(self, expression):
+ expression = expression or "True"
+ self.expression_str = expression
+ self.expression = compile(
+ source=expression,
+ filename="",
+ mode="eval",
+ flags=ast.PyCF_ONLY_AST | __future__.unicode_literals.compiler_flag,
+ )
+ self.matcher = None
+
+ def __str__(self):
+ return self.expression_str
+
+ def __repr__(self):
+ return 'Selector({!r})'.format(self.expression_str)
+
+ def __contains__(self, record):
+ return self.match(record)
+
+ def explain_selector(self, record, verbosity=VERBOSITY_ALL):
+ matcher = RecordContextMatcher(self.expression, self.expression_str, backtrace_verbosity=verbosity)
+ match_result = matcher.matches(record)
+ backtrace_info = matcher.selector_backtrace
+ if not HAVE_ASTOR:
+ backtrace_info.append(("WARNING: astor module not installed, trace not available", False))
+ return SelectorResult(self.expression_str, match_result, backtrace_info, [])
+
+ def match(self, record):
+ if not self.matcher:
+ self.matcher = RecordContextMatcher(self.expression, self.expression_str)
+
+ result = self.matcher.matches(record)
+ return result
+
+
+class WrappedRecord:
+ """WrappedRecord wraps a Record but will return a NoneObject for non existing attributes."""
+
+ __slots__ = ("record", )
+
+ def __init__(self, record):
+ self.record = record
+
+ def __getattr__(self, k):
+ return getattr(self.record, k, NONE_OBJECT)
+
+
+class CompiledSelector:
+ """CompiledSelector is faster than Selector but unsafe if you don't trust the query."""
+
+ def __init__(self, expression):
+ self.expression = expression or None
+ self.code = None
+ self.ns = {func.__name__: func for func in FUNCTION_WHITELIST}
+ self.ns["net"] = net
+
+ if expression:
+ self.code = compile(
+ source=expression,
+ filename="",
+ mode="eval",
+ flags=__future__.unicode_literals.compiler_flag,
+ )
+
+ def __str__(self):
+ return self.expression
+
+ def __repr__(self):
+ return 'CompiledSelector({!r})'.format(self.expression)
+
+ def __contains__(self, record):
+ return self.match(record)
+
+ def match(self, record):
+ if self.code is None:
+ return True
+ ns = self.ns.copy()
+ ns.update({
+ "r": WrappedRecord(record),
+ "Type": TypeMatcher(record),
+ })
+ return eval(self.code, ns)
+
+
+class TypeMatcher:
+ """
+ Helper to get and check fields of a certain type.
+
+ Types can be selected using `Type.<fieldtype>`. Attributes can be selected
+ using `Type.<fieldtype>.<attribute>`.
+
+ For example `Type.uri.filename` will retrieve all the filenames from all
+ uri's in a record.
+
+ These selectors can also still be used in other helper functions, as
+ they will unwrap to resulting fieldnames. So for example, you can still
+ do `field_contains(r, Type.string, ['something'])`, which will check
+ all `string` fields.
+
+ Membership tests also work. `'something' in Type.string` will perform
+ a membership test in each string value and return True if there are any.
+
+ Reverse membership tests are trickier, and only work with a non-compiled
+ Selector. For example, `Type.net.ipv4.Address in net.ipv4.Subnet('10.0.0.0/8')`
+ requires the TypeMatcher to unroll its values, which is only possible
+ when overriding this behaviour.
+ """
+
+ def __init__(self, rec):
+ self._rec = rec
+
+ def __getattr__(self, attr):
+ if attr in WHITELIST_TREE:
+ return TypeMatcherInstance(self._rec, [attr])
+
+ return NONE_OBJECT
+
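+# A short sketch (illustrative only) of the membership behaviour described in the
+# docstring above; the record value is a placeholder:
+#
+#   from flow.record import RecordDescriptor
+#
+#   TestRecord = RecordDescriptor("test/record", [("string", "query")])
+#   record = TestRecord("find something here")
+#   assert record in Selector("'something' in Type.string")
+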
+
+class TypeMatcherInstance:
+
+ def __init__(self, rec, ftypeparts=None, attrs=None):
+ self._rec = rec
+ self._ftypeparts = ftypeparts or []
+ self._attrs = attrs or []
+
+ self._ftype = None
+ self._ftypetree = WHITELIST_TREE
+ for p in ftypeparts:
+ self._ftypetree = self._ftypetree[p]
+
+ if self._ftypetree is True:
+ self._ftype = '.'.join(ftypeparts)
+
+ def __getattr__(self, attr):
+ if not self._ftype:
+ if attr not in self._ftypetree:
+ return NONE_OBJECT
+
+ ftypeparts = self._ftypeparts + [attr]
+ return TypeMatcherInstance(self._rec, ftypeparts)
+ elif not attr.startswith('_'):
+ attrs = self._attrs + [attr]
+ return TypeMatcherInstance(self._rec, self._ftypeparts, attrs)
+
+ return NONE_OBJECT
+
+ def __iter__(self):
+ return self._fields()
+
+ def _fields(self):
+ for f in self._rec._desc.getfields(self._ftype):
+ yield f.name
+
+ def _values(self):
+ for f in self._fields():
+ obj = getattr(self._rec, f, NONE_OBJECT)
+ for a in self._attrs:
+ obj = getattr(obj, a, NONE_OBJECT)
+
+ if obj is NONE_OBJECT:
+ continue
+
+ yield obj
+
+ def _subrecords(self):
+ """Return all fields that are records (records in records).
+
+ Returns: list of records
+ """
+ fields = self._rec._desc.getfields("record")
+ for f in fields:
+ r = getattr(self._rec, f.name)
+ if r is not None:
+ yield r
+
+ fields = self._rec._desc.getfields("record[]")
+ for f in fields:
+ records = getattr(self._rec, f.name)
+ if records is not None:
+ for r in records:
+ yield r
+
+ def _op(self, op, other):
+ for v in self._values():
+ if op(v, other):
+ return True
+
+ subrecords = self._subrecords()
+ for record in subrecords:
+ type_matcher = TypeMatcherInstance(record, self._ftypeparts, self._attrs)
+ if type_matcher._op(op, other):
+ return True
+
+ return False
+
+ def __eq__(self, other):
+ return self._op(operator.eq, other)
+
+ def __ne__(self, other):
+ return self._op(operator.ne, other)
+
+ def __lt__(self, other):
+ return self._op(operator.lt, other)
+
+ def __gt__(self, other):
+ return self._op(operator.gt, other)
+
+ def __le__(self, other):
+ return self._op(operator.le, other)
+
+ def __ge__(self, other):
+ return self._op(operator.ge, other)
+
+ def __noteq__(self, other):
+ return self._op(operator.ne, other)
+
+ def __contains__(self, other):
+ return self._op(operator.contains, other)
+
+
+class RecordContextMatcher:
+
+ def __init__(self, expr, expr_str, backtrace_verbosity=Selector.VERBOSITY_NONE):
+ self.expression = expr
+ self.expression_str = expr_str
+ self.selector_backtrace = []
+ self.selector_backtrace_verbosity = backtrace_verbosity
+ self.data = {}
+ self.rec = None
+
+ def matches(self, rec):
+ self.selector_backtrace = []
+ self.data = {
+ "None": None,
+ "True": True,
+ "False": False,
+ "str": str,
+ "fields": rec._desc.getfields,
+ "any": any,
+ "all": all,
+ }
+
+ # Add whitelisted functions to global dict
+ self.data.update({
+ func.__name__: func for func in FUNCTION_WHITELIST
+ })
+
+ self.data["r"] = rec
+ self.rec = rec
+
+ # This ensures backwards compatibility with old Selector queries
+ self.data["obj"] = rec
+
+ # Type matcher
+ self.data["Type"] = TypeMatcher(rec)
+
+ return self.eval(self.expression.body)
+
+ def eval(self, node):
+ r = self._eval(node)
+ verbosity = self.selector_backtrace_verbosity
+ log_trace = (
+ (verbosity == Selector.VERBOSITY_ALL) or
+ (verbosity == Selector.VERBOSITY_BRANCHES and isinstance(node, (ast.Compare, ast.BoolOp)))
+ )
+ if log_trace and HAVE_ASTOR:
+ source_line = astor.to_source(node)
+ self.selector_backtrace.append((source_line, r))
+ return r
+
+ def _eval(self, node):
+ if isinstance(node, ast.Num):
+ return node.n
+ elif isinstance(node, AST_NODE_S_TYPES):
+ return node.s
+ elif isinstance(node, AST_NODE_VALUE_TYPES):
+ return node.value
+ elif isinstance(node, ast.List):
+ return list(map(self.eval, node.elts))
+ elif isinstance(node, ast.Tuple):
+ return tuple(map(self.eval, node.elts))
+ elif isinstance(node, ast.Name):
+ if node.id not in self.data:
+ return getattr(dynamic_fieldtype, node.id)
+
+ return self.data[node.id]
+ elif isinstance(node, ast.Attribute):
+ if node.attr.startswith('__'):
+ raise InvalidOperation(
+ "Selector {!r} contains invalid attribute: {!r}".format(
+ self.expression_str, node.attr))
+
+ obj = self.eval(node.value)
+
+ return getattr(obj, node.attr, NONE_OBJECT)
+ elif isinstance(node, ast.BoolOp):
+ values = []
+ for expr in node.values:
+ try:
+ value = self.eval(expr)
+ except TypeError as e:
+ if 'NoneType' in str(e):
+ value = False
+ else:
+ raise
+ value = bool(value)
+ values.append(value)
+ result = values.pop(0)
+ for value in values:
+ result = AST_OPERATORS[type(node.op)](result, value)
+ return result
+ elif isinstance(node, ast.BinOp):
+ left = self.eval(node.left)
+ right = self.eval(node.right)
+ if isinstance(left, NoneObject) or isinstance(right, NoneObject):
+ return False
+ return AST_OPERATORS[type(node.op)](left, right)
+ elif isinstance(node, ast.UnaryOp):
+ return AST_OPERATORS[type(node.op)](self.eval(node.operand))
+ elif isinstance(node, ast.Compare):
+ left = self.eval(node.left)
+ right = self.eval(node.comparators[0])
+
+ # print [AST_COMPARATORS[type(node.ops[0])](getattr(self.rec, l.name), right) for l in left]
+ # return [AST_COMPARATORS[type(node.ops[0])](getattr(self.rec, l.name), right) for l in left]
+
+ comptype = type(node.ops[0])
+ comp = AST_COMPARATORS[comptype]
+
+ # Special case for __contains__, where we need to first unwrap all values matching the Type query
+ if comptype in (ast.In, ast.NotIn) and isinstance(left, TypeMatcherInstance):
+ for v in left._values():
+ if comp(v, right):
+ return True
+ return False
+ return comp(left, right)
+ elif isinstance(node, ast.Call):
+ if not isinstance(node.func, (ast.Attribute, ast.Name)):
+ raise InvalidOperation("Error, only ast.Attribute or ast.Name are expected")
+
+ func_name = resolve_attr_path(node)
+ if not (callable(self.data.get(func_name)) or func_name in WHITELIST):
+ raise InvalidOperation(
+ "Call '{}' not allowed. No calls other then whitelisted 'global' calls allowed!".format(
+ func_name))
+
+ func = self.eval(node.func)
+
+ args = list(map(self.eval, node.args))
+ kwargs = dict((kw.arg, self.eval(kw.value)) for kw in node.keywords)
+
+ return func(*args, **kwargs)
+
+ elif isinstance(node, ast.comprehension):
+ iter = self.eval(node.iter)
+ return iter
+
+ elif isinstance(node, ast.GeneratorExp):
+ def recursive_generator(gens):
+ """
+ Yield all the values in the deepest generator.
+
+ Example:
+ [ord(c) for line in file for c in line]
+ This function would yield all c values for this expression
+
+ Args:
+ gens: A list of generator/comprehension objects
+
+ Returns:
+ Generator
+ """
+ gens = list(gens)
+ gen = gens.pop()
+ loop_index_var_name = gen.target.id
+ resolved_gen = self.eval(gen)
+ if resolved_gen is not NONE_OBJECT:
+ for val in resolved_gen:
+ self.data[loop_index_var_name] = val
+ if len(gens) > 0:
+ for subval in recursive_generator(gens):
+ yield subval
+ else:
+ yield val
+
+ def generator_expr():
+ """
+ Embedded generator logic for ast.GeneratorExp.
+
+ A function can't both yield and return, so we define a nested generator function and return that.
+
+ Returns:
+ yields evaluated generator expression values
+
+ """
+ for gen in node.generators:
+ if gen.target.id in self.data:
+ raise InvalidOperation(
+ "Generator variable '{}' overwrites existing variable!".format(
+ gen.target.id))
+ values = recursive_generator(node.generators[::-1])
+ for val in values:
+ result = self.eval(node.elt)
+ yield result
+ return generator_expr()
+
+ raise TypeError(node)
+
+
+def make_selector(selector, force_compiled=False):
+ """Return a Selector object (either CompiledSelector or Selector)."""
+ ret = selector
+ if not selector:
+ ret = None
+ elif isinstance(selector, string_types):
+ ret = CompiledSelector(selector) if force_compiled else Selector(selector)
+ elif isinstance(selector, Selector):
+ if force_compiled:
+ ret = CompiledSelector(selector.expression_str)
+ return ret
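+
+
+# A minimal sketch (illustrative only) of the two selector flavours, mirroring the
+# test suite:
+#
+#   from flow.record import RecordDescriptor
+#
+#   TestRecord = RecordDescriptor("test/record", [("string", "query"), ("string", "url")])
+#   safe = make_selector("r.query == 'foo'")                        # AST-based Selector
+#   fast = make_selector("r.query == 'foo'", force_compiled=True)   # eval-based CompiledSelector
+#   assert TestRecord("foo", "bar") in safe
+#   assert TestRecord("foo", "bar") in fast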
diff --git a/flow/record/stream.py b/flow/record/stream.py
new file mode 100644
index 0000000..5723aec
--- /dev/null
+++ b/flow/record/stream.py
@@ -0,0 +1,293 @@
+from __future__ import print_function
+
+import os
+import sys
+import struct
+import logging
+import datetime
+from functools import lru_cache
+from collections import ChainMap
+
+from .base import RecordDescriptor, RecordReader
+from .packer import RecordPacker
+from flow.record import RecordWriter
+from flow.record.selector import make_selector
+from flow.record.fieldtypes import fieldtype_for_value
+
+
+log = logging.getLogger(__package__)
+
+RECORDSTREAM_MAGIC = b"RECORDSTREAM\n"
+
+
+def RecordOutput(fp):
+ """Return a RecordPrinter if `fp` is a tty otherwise a RecordStreamWriter."""
+ if hasattr(fp, "isatty") and fp.isatty():
+ return RecordPrinter(fp)
+ return RecordStreamWriter(fp)
+
+
+class RecordPrinter:
+ """Records are printed as textual representation (repr) to fp."""
+
+ fp = None
+
+ def __init__(self, fp, flush=True):
+ self.fp = fp
+ self.auto_flush = flush
+
+ def write(self, obj):
+ buf = repr(obj).encode() + b"\n"
+ self.fp.write(buf)
+ if self.auto_flush:
+ self.flush()
+
+ def flush(self):
+ self.fp.flush()
+
+ def close(self):
+ pass
+
+
+class RecordStreamWriter:
+ """Records are written as binary (serialized) to fp."""
+
+ fp = None
+ packer = None
+
+ def __init__(self, fp):
+ self.fp = fp
+ self.packer = RecordPacker()
+ self.packer.on_descriptor.add_handler(self.on_new_descriptor)
+ self.header_written = False
+
+ def __del__(self):
+ self.close()
+
+ def on_new_descriptor(self, descriptor):
+ self.write(descriptor)
+
+ def close(self):
+ if self.fp and self.fp != getattr(sys.stdout, "buffer", sys.stdout):
+ self.fp.close()
+ self.fp = None
+
+ def flush(self):
+ if not self.header_written:
+ self.writeheader()
+
+ def write(self, obj):
+ if not self.header_written:
+ self.writeheader()
+ blob = self.packer.pack(obj)
+ self.fp.write(struct.pack(">I", len(blob)))
+ self.fp.write(blob)
+
+ def writeheader(self):
+ self.header_written = True
+ self.write(RECORDSTREAM_MAGIC)
+
+
+class RecordStreamReader:
+ fp = None
+ recordtype = None
+ descs = None
+ packer = None
+
+ def __init__(self, fp, selector=None):
+ self.fp = fp
+ self.closed = False
+ self.selector = make_selector(selector)
+ self.packer = RecordPacker()
+ self.readheader()
+
+ def readheader(self):
+ # Manually read the msgpack format to avoid deserializing invalid data.
+ # We read size (4) + msgpack type (2) + msgpack bytes (recordstream magic).
+ header = self.fp.read(4 + 2 + len(RECORDSTREAM_MAGIC))
+ if not header.endswith(RECORDSTREAM_MAGIC):
+ raise IOError("Unknown file format, not a RecordStream")
+
+ def read(self):
+ d = self.fp.read(4)
+ if len(d) != 4:
+ raise EOFError()
+
+ size = struct.unpack(">I", d)[0]
+ d = self.fp.read(size)
+ return self.packer.unpack(d)
+
+ def close(self):
+ self.closed = True
+
+ def __iter__(self):
+ try:
+ while not self.closed:
+ obj = self.read()
+ if obj == RECORDSTREAM_MAGIC:
+ continue
+ if isinstance(obj, RecordDescriptor):
+ self.packer.register(obj)
+ else:
+ if not self.selector or self.selector.match(obj):
+ yield obj
+ except EOFError:
+ pass
+
+
+def record_stream(sources, selector=None):
+ """Return a Record stream generator from the given Record sources.
+
+ Exceptions in a Record source will be caught so the stream is not interrupted.
+ """
+ log.debug("Record stream with selector: {!r}".format(selector))
+ for src in sources:
+ # Inform user that we are reading from stdin
+ if src in ("-", ""):
+ print("[reading from stdin]", file=sys.stderr)
+
+ # Initial value for reader, in case of exception message
+ reader = "RecordReader"
+ try:
+ reader = RecordReader(src, selector=selector)
+ for rec in reader:
+ yield rec
+ reader.close()
+ except IOError as e:
+ log.error("{}({!r}): {}".format(reader, src, e))
+ except KeyboardInterrupt:
+ raise
+ except Exception as e: # noqa: B902
+ log.warning(
+ "Exception in {!r} for {!r}: {!r} -- skipping to next reader".format(
+ reader, src, e))
+ continue
+
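+# A short sketch (illustrative only) of reading records from one or more sources,
+# as rdump does; the file name is a placeholder:
+#
+#   from flow.record.selector import make_selector
+#
+#   selector = make_selector("r.query == 'foo'")
+#   for record in record_stream(["data.records.gz"], selector):
+#       print(record)
+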
+
+class PathTemplateWriter:
+ """Write records to a path on disk, path can be a template string.
+
+ This allows for archiving records on disk based on timestamp for example.
+
+ Default template string is:
+
+ '{name}-{record._generated:%Y%m%dT%H}.records.gz'
+
+ Available template fields:
+
+ `name` defaults to "records", but can be overridden in the initializer.
+ `record` is the record object
+ `ts` is record._generated
+
+ If the destination path already exists it will rename the existing file using the current datetime.
+ """
+
+ DEFAULT_TEMPLATE = '{name}-{record._generated:%Y%m%dT%H}.records.gz'
+
+ def __init__(self, path_template=None, name=None):
+ self.path_template = path_template or self.DEFAULT_TEMPLATE
+ self.name = name or "records"
+ self.current_path = None
+ self.writer = None
+ self.stream = None
+
+ def rotate_existing_file(self, path):
+ if os.path.exists(path):
+ now = datetime.datetime.utcnow()
+ src = os.path.realpath(path)
+
+ src_dir = os.path.dirname(src)
+ src_fname = os.path.basename(src)
+
+ # stamp will be part of new filename to denote rotation stamp
+ stamp = '{now:%Y%m%dT%H%M%S}'.format(now=now)
+
+ # Use "records.gz" as the extension if we have this naming convention
+ if src_fname.endswith('.records.gz'):
+ fname, _ = src_fname.rsplit('.records.gz', 1)
+ ext = "records.gz"
+ else:
+ fname, ext = os.path.splitext(src_fname)
+
+ # insert the rotation stamp into the new filename.
+ dst = os.path.join(src_dir, '{fname}.{stamp}.{ext}'.format(**locals()))
+ log.info('RENAME {!r} -> {!r}'.format(src, dst))
+ os.rename(src, dst)
+
+ def record_stream_for_path(self, path):
+ if self.current_path != path:
+ self.current_path = path
+ log.info('Writing records to {!r}'.format(path))
+ self.rotate_existing_file(path)
+ dst_dir = os.path.dirname(path)
+ if not os.path.exists(dst_dir):
+ os.makedirs(dst_dir)
+ rs = RecordWriter(path)
+ self.close()
+ self.writer = rs
+ return self.writer
+
+ def write(self, record):
+ ts = record._generated or datetime.datetime.utcnow()
+ path = self.path_template.format(name=self.name, record=record, ts=ts)
+ rs = self.record_stream_for_path(path)
+ rs.write(record)
+ rs.fp.flush()
+
+ def close(self):
+ if self.writer:
+ self.writer.close()
+
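+# A short usage sketch (illustrative only); the path template and values are placeholders:
+#
+#   from flow.record import RecordDescriptor
+#
+#   TestRecord = RecordDescriptor("test/dns", [("string", "query")])
+#   writer = PathTemplateWriter("/tmp/out/{name}-{ts:%Y%m%d}.records.gz", name="dns")
+#   writer.write(TestRecord("example.com"))   # lands in e.g. /tmp/out/dns-20230101.records.gz
+#   writer.close()
+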
+
+class RecordArchiver(PathTemplateWriter):
+ """RecordWriter that writes/archives records to a path with YYYY/mm/dd."""
+
+ def __init__(self, archive_path, path_template=None, name=None):
+ path_template = path_template or self.DEFAULT_TEMPLATE
+ template = os.path.join(str(archive_path), "{ts:%Y/%m/%d}", path_template)
+ PathTemplateWriter.__init__(self, path_template=template, name=name)
+
+
+class RecordFieldRewriter:
+ """Rewrite records using a new RecordDescriptor for chosen fields and/or excluded or new record fields."""
+
+ def __init__(self, fields=None, exclude=None, expression=None):
+ self.fields = fields or []
+ self.exclude = exclude or []
+ self.expression = compile(expression, '', 'exec') if expression else None
+
+ @lru_cache(maxsize=256)
+ def record_descriptor_for_fields(self, descriptor, fields=None, exclude=None, new_fields=None):
+ if not fields and not exclude and not new_fields:
+ return descriptor
+ exclude = exclude or []
+ desc_fields = []
+ if fields:
+ for fname in fields:
+ if fname in exclude:
+ continue
+ field = descriptor.fields.get(fname, None)
+ if field:
+ desc_fields.append((field.typename, field.name))
+ else:
+ desc_fields = [(ftype, fname) for (ftype, fname) in descriptor.get_field_tuples() if fname not in exclude]
+ if new_fields:
+ desc_fields.extend(new_fields)
+ return RecordDescriptor(descriptor.name, desc_fields)
+
+ def rewrite(self, record):
+ if not self.fields and not self.exclude and not self.expression:
+ return record
+
+ local_dict = {}
+ new_fields = []
+ if self.expression:
+ exec(self.expression, record._asdict(), local_dict)
+ # convert new variables to new record fields (field type is derived from value)
+ new_fields = [(fieldtype_for_value(val, "string"), key) for key, val in local_dict.items()]
+
+ RewriteRecord = self.record_descriptor_for_fields(
+ record._desc, tuple(self.fields), tuple(self.exclude), tuple(new_fields)
+ )
+ # give new variables precedence
+ return RewriteRecord.init_from_dict(ChainMap(local_dict, record._asdict()))
diff --git a/flow/record/tools/__init__.py b/flow/record/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/flow/record/tools/geoip.py b/flow/record/tools/geoip.py
new file mode 100644
index 0000000..3a940fa
--- /dev/null
+++ b/flow/record/tools/geoip.py
@@ -0,0 +1,194 @@
+# Python imports
+import re
+import sys
+import random
+import argparse
+import logging
+
+# Flow imports
+from flow.record.utils import catch_sigpipe
+from flow.record import (
+ RecordDescriptor,
+ RecordWriter,
+ record_stream,
+ extend_record,
+)
+
+# Third party imports
+import maxminddb
+
+
+logger = logging.getLogger(__name__)
+
+IPv4Record = RecordDescriptor(
+ "geo/ipv4",
+ [
+ ("net.ipaddress", "ip"),
+ ],
+)
+
+GeoRecord = RecordDescriptor(
+ "maxmind/geo",
+ [
+ ("string", "country"),
+ ("string", "country_code"),
+ ("string", "city"),
+ ("float", "longitude"),
+ ("float", "latitude"),
+ ],
+)
+
+AsnRecord = RecordDescriptor(
+ "maxmind/asn",
+ [
+ ("string", "asn"),
+ ("string", "org"),
+ ],
+)
+
+DEFAULT_CITY_DB = "/usr/share/GeoIP/GeoLite2-City.mmdb"
+DEFAULT_ASN_DB = "/usr/share/GeoIP/GeoLite2-ASN.mmdb"
+REGEX_IPV4 = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
+
+
+def georecord_for_ip(city_db, ip):
+ r = city_db.get(ip) if city_db else None
+ if not r:
+ return GeoRecord()
+
+ loc_dict = r.get("location", {})
+ country_dict = r.get("country", {})
+ city_dict = r.get("city", {})
+
+ country = country_dict.get("names", {}).get("en")
+ country_code = country_dict.get("iso_code")
+ city = city_dict.get("names", {}).get("en")
+ lon = loc_dict.get("longitude")
+ lat = loc_dict.get("latitude")
+
+ return GeoRecord(
+ country=country,
+ country_code=country_code,
+ city=city,
+ longitude=lon,
+ latitude=lat,
+ )
+
+
+def asnrecord_for_ip(asn_db, ip):
+ r = asn_db.get(ip) if asn_db else None
+ if not r:
+ return AsnRecord()
+ asn = r.get("autonomous_system_number", None)
+ org = r.get("autonomous_system_organization", None)
+ return AsnRecord(asn=asn, org=org)
+
+
+def ip_records_from_text_files(files):
+ """Yield IPv4Records by extracting IP addresses from `files` using a regex."""
+ for fname in files:
+ with open(fname, "r") if fname != "-" else sys.stdin as f:
+ for line in f:
+ for ip in REGEX_IPV4.findall(line):
+ yield IPv4Record(ip)
+
+
+@catch_sigpipe
+def main():
+ parser = argparse.ArgumentParser(
+ description="Annotate records with GeoIP and ASN data",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+ parser.add_argument(
+ "-c", "--city-db", default=DEFAULT_CITY_DB, help="path to GeoIP city database"
+ )
+ parser.add_argument(
+ "-a", "--asn-db", default=DEFAULT_ASN_DB, help="path to GeoIP ASN database"
+ )
+ parser.add_argument(
+ "-i",
+ "--ip-field",
+ metavar="FIELD",
+ default="ip",
+ help="the source record field to use for lookups",
+ )
+ parser.add_argument(
+ "-w",
+ "--writer",
+ metavar="OUTPUT",
+ default="-",
+ help="write records to output",
+ )
+ parser.add_argument("input", nargs="*", default=["-"], help="input files")
+ parser.add_argument(
+ "-t",
+ "--text",
+ action="store_true",
+ help="treats input as text and extract IPv4 Records using regex",
+ )
+
+ # Hidden options
+ parser.add_argument(
+ "-m", "--mode", type=int, default=maxminddb.MODE_AUTO, help=argparse.SUPPRESS
+ )
+ parser.add_argument("-g", "--generate", action="store_true", help=argparse.SUPPRESS)
+ args = parser.parse_args()
+
+ if args.generate:
+ with RecordWriter() as writer:
+ while True:
+ record = IPv4Record(random.randint(0, 0xFFFFFFFF))
+ writer.write(record)
+
+ if args.mode:
+ logger.warning("MODE: %u", args.mode)
+
+ try:
+ city_db = maxminddb.open_database(args.city_db, args.mode)
+ except FileNotFoundError:
+ logger.warning(
+ "[*] Disabled Geo record annotation. (database not found: %r)",
+ args.city_db,
+ )
+ city_db = None
+
+ try:
+ asn_db = maxminddb.open_database(args.asn_db, args.mode)
+ except FileNotFoundError:
+ logger.warning(
+ "[*] Disabled ASN record annotation. (database not found: %r)", args.asn_db
+ )
+ asn_db = None
+
+ if not any([city_db, asn_db]) and not args.text:
+ print(
+ "[!] Both City and ASN database not available. Nothing to annotate, exiting..",
+ file=sys.stderr,
+ )
+ return 1
+
+ if args.text:
+ # Input consists of text files; extract IPv4Records using a regex
+ record_iterator = ip_records_from_text_files(args.input)
+ else:
+ # Input consists of Record files
+ record_iterator = record_stream(args.input)
+
+ with RecordWriter(args.writer) as writer:
+ for record in record_iterator:
+ ip = getattr(record, args.ip_field, None)
+
+ annotated_records = []
+ if city_db:
+ geo_record = georecord_for_ip(city_db, str(ip)) if ip else GeoRecord()
+ annotated_records.append(geo_record)
+ if asn_db:
+ asn_record = asnrecord_for_ip(asn_db, str(ip)) if ip else AsnRecord()
+ annotated_records.append(asn_record)
+
+ record = extend_record(record, annotated_records)
+ writer.write(record)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/flow/record/tools/rdump.py b/flow/record/tools/rdump.py
new file mode 100644
index 0000000..3d550e9
--- /dev/null
+++ b/flow/record/tools/rdump.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import sys
+import logging
+
+from flow.record import RecordWriter, record_stream
+from flow.record.stream import RecordFieldRewriter
+from flow.record.selector import make_selector
+from flow.record.utils import catch_sigpipe
+
+try:
+ from flow.record.version import version
+except ImportError:
+ version = "unknown"
+
+log = logging.getLogger(__name__)
+
+try:
+ # Python 2
+ import urlparse
+ from urllib import urlencode
+except ImportError:
+ # Python 3
+ import urllib.parse as urlparse
+ from urllib.parse import urlencode
+
+
+@catch_sigpipe
+def main():
+ import argparse
+ parser = argparse.ArgumentParser(
+ description="Record dumper, a tool that can read, write and filter records",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument(
+ '--version', action='version', version="flow.record version {}".format(version))
+ parser.add_argument(
+ 'src', metavar='SOURCE', nargs='*', default=['-'],
+ help='Record source')
+ parser.add_argument(
+ '-v', '--verbose', action='count', default=0,
+ help='Increase verbosity')
+
+ misc = parser.add_argument_group("miscellaneous")
+ misc.add_argument(
+ '-l', '--list', action='store_true',
+ help='List unique Record Descriptors')
+ misc.add_argument(
+ '-n', '--no-compile', action='store_true',
+ help="Don't use a compiled selector (safer, but slower)")
+ misc.add_argument(
+ '--record-source', default=None,
+ help='Overwrite the record source field')
+ misc.add_argument(
+ '--record-classification', default=None,
+ help='Overwrite the record classification field')
+
+ selection = parser.add_argument_group('selection')
+ selection.add_argument(
+ '-F', '--fields', metavar='FIELDS',
+ help='Fields (comma separated) to output when dumping')
+ selection.add_argument(
+ '-X', '--exclude', metavar='FIELDS',
+ help='Fields (comma separated) to exclude when dumping')
+ selection.add_argument(
+ '-s', '--selector', metavar='SELECTOR', default=None,
+ help='Only output records matching Selector')
+
+ output = parser.add_argument_group('output control')
+ output.add_argument(
+ '-f', '--format', metavar='FORMAT',
+ help='Format string')
+ output.add_argument(
+ '-c', '--count', type=int,
+ help='Exit after COUNT records')
+ output.add_argument(
+ '-w', '--writer', metavar='OUTPUT', default=None,
+ help='Write records to output')
+ output.add_argument(
+ '-m', '--mode', default=None, choices=("csv", "json", "jsonlines", "line"),
+ help='Output mode')
+
+ advanced = parser.add_argument_group('advanced')
+ advanced.add_argument(
+ '-E', "--exec-expression",
+ help="execute a (Python) expression for each record AFTER selector matching, can be used to assign new fields")
+
+ aliases = parser.add_argument_group('aliases')
+ aliases.add_argument(
+ '-j', '--json', action='store_const', const='json', dest='mode',
+ default=argparse.SUPPRESS,
+ help='Short for --mode=json')
+ aliases.add_argument(
+ '-J', '--jsonlines', action='store_const', const='jsonlines', dest='mode',
+ default=argparse.SUPPRESS,
+ help='Short for --mode=jsonlines')
+ aliases.add_argument(
+ '-C', '--csv', action='store_const', const='csv', dest='mode',
+ default=argparse.SUPPRESS,
+ help='Short for --mode=csv')
+ aliases.add_argument(
+ "-L", "--line", action='store_const', const='line', dest='mode',
+ default=argparse.SUPPRESS,
+ help='Short for --mode=line')
+
+ args = parser.parse_args()
+
+ levels = [logging.WARNING, logging.INFO, logging.DEBUG]
+ level = levels[min(len(levels) - 1, args.verbose)]
+ logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(message)s")
+
+ fields_to_exclude = args.exclude.split(",") if args.exclude else []
+ fields = args.fields.split(",") if args.fields else []
+
+ uri = args.writer or "text://"
+ if not args.writer:
+ mode_to_uri = {
+ "csv": "csvfile://",
+ "json": "jsonfile://?indent=2",
+ "jsonlines": "jsonfile://",
+ "line": "line://",
+ }
+ uri = mode_to_uri.get(args.mode, uri)
+ qparams = {
+ "fields": args.fields,
+ "exclude": args.exclude,
+ "format_spec": args.format,
+ }
+ query = urlencode({k: v for k, v in qparams.items() if v})
+ uri += "&" if urlparse.urlparse(uri).query else "?" + query
+
+ record_field_rewriter = None
+ if fields or fields_to_exclude or args.exec_expression:
+ record_field_rewriter = RecordFieldRewriter(fields, fields_to_exclude, args.exec_expression)
+
+ selector = make_selector(args.selector, not args.no_compile)
+ seen_desc = set()
+ count = 0
+ with RecordWriter(uri) as record_writer:
+ for count, rec in enumerate(record_stream(args.src, selector)):
+ if args.count and count >= args.count:
+ break
+
+ if args.record_source is not None:
+ rec._source = args.record_source
+ if args.record_classification is not None:
+ rec._classification = args.record_classification
+ if record_field_rewriter:
+ rec = record_field_rewriter.rewrite(rec)
+
+ # Dump RecordDescriptors
+ if args.list:
+ desc = rec._desc
+ if desc.descriptor_hash not in seen_desc:
+ seen_desc.add(desc.descriptor_hash)
+ print("# {}".format(desc))
+ print(desc.definition())
+ print()
+ continue
+
+ record_writer.write(rec)
+
+ if args.list:
+ print("Processed {} records".format(count))
+
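+# Example invocations (illustrative only; file names are placeholders):
+#
+#   rdump data.records.gz                               # dump records as text (default)
+#   rdump data.records.gz -s "r.query == 'foo'"         # filter with a selector
+#   rdump data.records.gz -C -F query,url               # dump selected fields as CSV
+#   rdump data.records.gz -l                            # list unique RecordDescriptors
+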
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/flow/record/utils.py b/flow/record/utils.py
new file mode 100644
index 0000000..bffccc0
--- /dev/null
+++ b/flow/record/utils.py
@@ -0,0 +1,87 @@
+import os
+import sys
+import base64
+from functools import wraps
+
+_native = str
+_unicode = type(u'')
+_bytes = type(b'')
+
+
+def is_stdout(fp):
+ return fp == getattr(sys.stdout, "buffer", sys.stdout)
+
+
+def to_bytes(value):
+ """Convert a value to a byte string."""
+ if value is None or isinstance(value, _bytes):
+ return value
+ if isinstance(value, _unicode):
+ return value.encode("utf-8")
+ return _bytes(value)
+
+
+def to_str(value):
+ """Convert a value to a unicode string."""
+ if value is None or isinstance(value, _unicode):
+ return value
+ if isinstance(value, _bytes):
+ return value.decode("utf-8")
+ return _unicode(value)
+
+
+def to_native_str(value):
+ """Convert a value to a native `str`."""
+ if value is None or isinstance(value, _native):
+ return value
+ if isinstance(value, _unicode):
+ # Python 2: unicode -> str
+ return value.encode("utf-8")
+ if isinstance(value, _bytes):
+ # Python 3: bytes -> str
+ return value.decode("utf-8")
+ return _native(value)
+
+
+def to_base64(value):
+ """Convert a value to a base64 string."""
+ return base64.b64encode(value).decode()
+
+
+def catch_sigpipe(func):
+ """Catches KeyboardInterrupt and BrokenPipeError (OSError 22 on Windows)."""
+
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except KeyboardInterrupt:
+ print("Aborted!", file=sys.stderr)
+ return 1
+ except (BrokenPipeError, OSError) as e:
+ exc_type = type(e)
+ # Only catch BrokenPipeError or OSError 22
+ if (exc_type is BrokenPipeError) or (exc_type is OSError and e.errno == 22):
+ devnull = os.open(os.devnull, os.O_WRONLY)
+ os.dup2(devnull, sys.stdout.fileno())
+ return 1
+ # Raise other exceptions
+ raise
+
+ return wrapper
+
+
+class EventHandler:
+
+ def __init__(self):
+ self.handlers = []
+
+ def add_handler(self, callback):
+ self.handlers.append(callback)
+
+ def remove_handler(self, callback):
+ self.handlers.remove(callback)
+
+ def __call__(self, *args, **kwargs):
+ for h in self.handlers:
+ h(*args, **kwargs)
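+
+
+# A tiny sketch (illustrative only) of how EventHandler is used elsewhere in this
+# package (e.g. RecordPacker.on_descriptor); `some_descriptor` is a placeholder:
+#
+#   on_descriptor = EventHandler()
+#   on_descriptor.add_handler(lambda desc: print(desc))
+#   on_descriptor(some_descriptor)   # calls every registered handler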
diff --git a/flow/record/whitelist.py b/flow/record/whitelist.py
new file mode 100644
index 0000000..dee0add
--- /dev/null
+++ b/flow/record/whitelist.py
@@ -0,0 +1,40 @@
+WHITELIST = [
+ "boolean",
+ "dynamic",
+ "datetime",
+ "filesize",
+ "uint16",
+ "uint32",
+ "float",
+ "string",
+ "stringlist",
+ "dictlist",
+ "unix_file_mode",
+ "varint",
+ "wstring",
+ "net.ipv4.Address",
+ "net.ipv4.Subnet",
+ "net.tcp.Port",
+ "net.udp.Port",
+ "uri",
+ "digest",
+ "bytes",
+ "record",
+ "net.ipaddress",
+ "net.ipnetwork",
+ "net.IPAddress",
+ "net.IPNetwork",
+]
+
+
+WHITELIST_TREE = {}
+for field in WHITELIST:
+ parent = None
+ obj = WHITELIST_TREE
+ for part in field.split('.'):
+ if part not in obj:
+ obj[part] = {}
+ parent = obj
+ obj = obj[part]
+
+ parent[part] = True
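+
+# For illustration, the resulting tree maps plain type names straight to True and
+# splits dotted names into nested dicts, e.g.:
+#
+#   WHITELIST_TREE["string"] is True
+#   WHITELIST_TREE["net"]["ipaddress"] is True
+#   WHITELIST_TREE["net"]["ipv4"]["Address"] is True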
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..6d6687e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[build-system]
+requires = ["setuptools>=43.0.0", "wheel", "setuptools_scm[toml]>=3.4.1"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools_scm]
+write_to = "flow/record/version.py"
+
+[tool.black]
+line-length = 120
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..34ae005
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,9 @@
+[metadata]
+author = Dissect Team
+author_email = dissect@fox-it.com
+url = https://github.com/fox-it/flow.record
+license = Affero General Public License v3
+long_description = file: README.md
+license_files = LICENSE, COPYRIGHT
+classifiers =
+ Programming Language :: Python :: 3
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..4b3a22f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+setup(
+ name='flow.record',
+ packages=['flow.' + v for v in find_packages('flow')],
+ install_requires=[
+ 'msgpack>=0.5.2',
+ ],
+ extras_require={
+ # Note: these compression libraries do not work well with pypy
+ 'compression': [
+ 'lz4',
+ 'zstandard',
+ ],
+ },
+ namespace_packages=['flow'],
+ entry_points={
+ 'console_scripts': [
+ 'r=flow.record.tools.r:main',
+ 'rdd=flow.record.tools.rdd:main',
+ 'rselect=flow.record.tools.rselect:main',
+ 'rdump=flow.record.tools.rdump:main',
+ 'rgeoip=flow.record.tools.geoip:main',
+ ],
+ },
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/selector_explain_example.py b/tests/selector_explain_example.py
new file mode 100644
index 0000000..9520b4e
--- /dev/null
+++ b/tests/selector_explain_example.py
@@ -0,0 +1,32 @@
+from flow.record.selector import Selector
+from flow.record import RecordDescriptor
+
+desc = RecordDescriptor("test/samplerecord", [
+ ("uint16", "value"),
+ ("string", "x"),
+])
+
+
+def main():
+ s_str = u"r.x == u'\\u2018Test\\u2019' or r.value == 17 or (r.value == 1337 and r.x == 'YOLO')"
+ print(u"Evaluating selector.... \n{}".format(s_str))
+ print("\n")
+ s = Selector(s_str)
+ obj = desc(0, "Test")
+ obj.x = u"\u2018Test\u2019"
+ obj.value = 16
+ val = s.explain_selector(obj)
+ print(val.backtrace())
+
+
+if __name__ == "__main__":
+ main()
+
+
+"""
+r.x == 'Test' or r.value == 17 -> True
+ r.x == 'Test' -> True
+ or
+ r.value == 17 -> False
+
+"""
diff --git a/tests/standalone_test.py b/tests/standalone_test.py
new file mode 100644
index 0000000..3d8749d
--- /dev/null
+++ b/tests/standalone_test.py
@@ -0,0 +1,16 @@
+from __future__ import print_function
+
+
+def main(glob):
+ for var, val in sorted(glob.items()):
+ if not var.startswith("test_"):
+ continue
+
+ print("{:40s}".format(var), end="")
+ try:
+ val()
+ print("PASSED")
+ except Exception: # noqa: B902
+ print("FAILED")
+ import traceback
+ traceback.print_exc()
diff --git a/tests/test_compiled_selector.py b/tests/test_compiled_selector.py
new file mode 100644
index 0000000..ff8995f
--- /dev/null
+++ b/tests/test_compiled_selector.py
@@ -0,0 +1,37 @@
+from flow.record import RecordDescriptor
+from flow.record.selector import CompiledSelector as Selector
+
+
+def test_selector_func_name():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+ assert TestRecord(None, None) not in Selector("name(r) == 'foo/bar'")
+ assert TestRecord(None, None) in Selector("name(r) == 'test/record'")
+
+
+def test_selector():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+
+ assert TestRecord("foo", "bar") in Selector("r.query == 'foo'")
+ assert TestRecord(None, None) not in Selector("r.query == 'foo'")
+ assert TestRecord(None, None) not in Selector("name(r.query) == 'XX'")
+
+
+def test_non_existing_field():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+
+ assert TestRecord("foo", "bar") not in Selector("r.query and r.non_existing_field")
+ assert TestRecord("foo", "bar") in Selector("not r.non_existing_field")
+ assert TestRecord("foo", "bar") in Selector("r.query and r.url and not r.non_existing_field")
+
+
+if __name__ == "__main__":
+ __import__("standalone_test").main(globals())
diff --git a/tests/test_fieldtype_ip.py b/tests/test_fieldtype_ip.py
new file mode 100644
index 0000000..94a683f
--- /dev/null
+++ b/tests/test_fieldtype_ip.py
@@ -0,0 +1,238 @@
+from __future__ import unicode_literals
+
+import pytest
+
+from flow.record import RecordDescriptor
+from flow.record import RecordPacker
+from flow.record.fieldtypes import net
+from flow.record.selector import Selector, CompiledSelector
+
+
+def test_field_ipaddress():
+ a = net.IPAddress("192.168.1.1")
+ assert a == "192.168.1.1"
+
+ with pytest.raises(ValueError) as excinfo:
+ net.IPAddress("a.a.a.a")
+ excinfo.match(".* does not appear to be an IPv4 or IPv6 address")
+
+
+def test_field_ipnetwork():
+ a = net.IPNetwork("192.168.1.0/24")
+ assert a == "192.168.1.0/24"
+
+ # Host bits set
+ with pytest.raises(ValueError) as excinfo:
+ net.IPNetwork("192.168.1.10/24")
+ excinfo.match(".* has host bits set")
+
+
+def test_record_ipaddress():
+ TestRecord = RecordDescriptor("test/ipaddress", [
+ ("net.ipaddress", "ip"),
+ ])
+
+ r = TestRecord("127.0.0.1")
+ assert r.ip == "127.0.0.1"
+ assert r.ip != "lala.1234.bad.ip"
+ assert isinstance(r.ip, net.ipaddress)
+ assert repr(r.ip) == "net.ipaddress('127.0.0.1')"
+
+ # ipv4
+ assert TestRecord("1.1.1.1").ip == "1.1.1.1"
+ assert TestRecord("0.0.0.0").ip == "0.0.0.0"
+ assert TestRecord("192.168.0.1").ip == "192.168.0.1"
+ assert TestRecord("255.255.255.255").ip == "255.255.255.255"
+
+ # ipv6
+ assert TestRecord("::1").ip == "::1"
+ assert TestRecord("2001:4860:4860::8888").ip == "2001:4860:4860::8888"
+ assert TestRecord("2001:4860:4860::4444").ip == "2001:4860:4860::4444"
+
+ # instantiate from different types
+ assert TestRecord(1).ip == "0.0.0.1"
+ assert TestRecord(0x7f0000ff).ip == "127.0.0.255"
+ assert TestRecord(b"\x7f\xff\xff\xff").ip == "127.255.255.255"
+ assert TestRecord("127.0.0.1").ip == "127.0.0.1"
+
+ # invalid ip addresses
+ for invalid in ["1.1.1.256", "192.168.0.1/24", "a.b.c.d", ":::::1"]:
+ with pytest.raises(Exception) as excinfo:
+ TestRecord(invalid)
+ excinfo.match(r'.*does not appear to be an IPv4 or IPv6 address*')
+
+ r = TestRecord()
+ assert r.ip is None
+
+
+def test_record_ipnetwork():
+ TestRecord = RecordDescriptor("test/ipnetwork", [
+ ("net.ipnetwork", "subnet"),
+ ])
+
+ # ipv4
+ r = TestRecord("192.168.0.0/24")
+ assert r.subnet == "192.168.0.0/24"
+ assert r.subnet != "bad.sub/net"
+ assert "bad.ip" not in r.subnet
+ assert "192.168.0.1" in r.subnet
+ assert "192.168.0.2/32" in r.subnet
+ assert "192.168.0.255" in r.subnet
+ assert "192.168.0.128/30" in r.subnet
+ assert "192.168.1.1" not in r.subnet
+ assert isinstance(r.subnet, net.ipnetwork)
+ assert repr(r.subnet) == "net.ipnetwork('192.168.0.0/24')"
+
+ r = TestRecord("192.168.1.1/32")
+ assert r.subnet == "192.168.1.1"
+ assert r.subnet == "192.168.1.1/32"
+ assert "192.168.1.1" in r.subnet
+ assert "192.168.1.1/32" in r.subnet
+
+ # ipv6 - https://en.wikipedia.org/wiki/IPv6_address
+ r = TestRecord("::1")
+ assert r.subnet == "::1"
+ assert r.subnet == "::1/128"
+
+ r = TestRecord("::/0")
+ assert "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff" in r.subnet
+ assert "::" in r.subnet
+ assert "::1" in r.subnet
+
+ r = TestRecord("64:ff9b::/96")
+ assert "64:ff9b::0.0.0.0" in r.subnet
+ assert "64:ff9b::255.255.255.255" in r.subnet
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_ipaddress(PSelector):
+ TestRecord = RecordDescriptor("test/ipaddress", [
+ ("string", "description"),
+ ("net.ipaddress", "ip"),
+ ])
+
+ records = [
+ TestRecord("Google DNS IPv4", "8.8.8.8"),
+ TestRecord("Google DNS IPv4", "8.8.4.4"),
+ TestRecord("Google DNS IPv6", "2001:4860:4860::8888"),
+ TestRecord("Google DNS IPv6", "2001:4860:4860::4444"),
+ ]
+
+ recs = [r for r in records if r in PSelector("r.ip in net.ipnetwork('8.8.0.0/16')")]
+ assert len(recs) == 2
+
+ recs = [r for r in records if r in PSelector("r.ip == '8.8.8.8'")]
+ assert len(recs) == 1
+
+ recs = [r for r in records if r in PSelector("r.ip in net.ipnetwork('2001:4860:4860::/48')")]
+ assert len(recs) == 2
+
+ record = TestRecord("Optional", None)
+ assert record not in PSelector("r.ip == '1.1.1.1'")
+ assert record in PSelector("r.ip == None")
+ assert record in PSelector("not r.ip")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_ipnetwork(PSelector):
+ TestRecord = RecordDescriptor("test/ipnetwork", [
+ ("string", "description"),
+ ("net.ipnetwork", "subnet"),
+ ])
+
+ records = [
+ # ipv4
+ TestRecord("RFC1918", "10.0.0.0/8"),
+ TestRecord("RFC1918", "172.16.0.0/12"),
+ TestRecord("RFC1918", "192.168.0.0/16"),
+ # ipv6
+ TestRecord("Private network", "fc00::/7"),
+ TestRecord("Link local", "fe80::/10"),
+ TestRecord("Facebook IPv6 range", "2a03:2880::/32"),
+ ]
+ recs = [r for r in records if r in PSelector("'fe80::1ff:fe23:4567:890a' in r.subnet")]
+ assert len(recs) == 1
+
+ recs = [r for r in records if r in PSelector("'2a03:2880:f003:c07:face:b00c::2' in r.subnet")]
+ assert len(recs) == 1
+
+ recs = [r for r in records if r in PSelector("'192.168.1.0/24' in r.subnet")]
+ assert len(recs) == 1
+ assert recs[0].subnet == "192.168.0.0/16"
+
+ recs = [r for r in records if r in PSelector("'192.168.1.141' in r.subnet")]
+ assert len(recs) == 1
+ assert recs[0].subnet == "192.168.0.0/16"
+
+ record = TestRecord("Google", "8.0.0.0/8")
+ assert record in PSelector("'8.8.4.4' in r.subnet")
+ assert record in PSelector("'8.8.8.8/32' in r.subnet")
+ assert record in PSelector("'8.8.0.0/16' in r.subnet")
+ assert record in PSelector("'8.8.4.0/24' in r.subnet")
+ assert record in PSelector("'8.8.8.0/24' in r.subnet")
+
+ record = TestRecord("Optional", None)
+ assert record not in PSelector("r.subnet and '1.1.1.1' in r.subnet")
+ assert record in PSelector("r.subnet == None")
+ assert record in PSelector("not r.subnet")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_ipaddress_in_ipnetwork(PSelector):
+ TestRecord = RecordDescriptor("test/scandata", [
+ ("net.ipaddress", "ip"),
+ ("uint16", "port"),
+ ("string", "description"),
+ ])
+
+ records = [
+ TestRecord("8.8.8.8", 53, "google"),
+ TestRecord("1.1.1.1", 53, "cloudflare"),
+ TestRecord("2620:fe::9", 53, "quad9"),
+ TestRecord(None, None, "empty"),
+ ]
+
+ for record in records:
+ if record in PSelector('r.ip in net.ipnetwork("8.8.0.0/16")'):
+ assert record.ip == "8.8.8.8"
+
+ for record in records:
+ if record in PSelector('r.ip in net.ipnetwork("1.1.1.1/32")'):
+ assert record.ip == "1.1.1.1"
+
+ for record in records:
+ if record in PSelector('r.ip in net.ipnetwork("2620:FE::/48")'):
+ assert record.description == "quad9"
+ assert record.ip == "2620:00fe:0:0:0:0:0:0009"
+
+
+def test_pack_ipaddress():
+ packer = RecordPacker()
+
+ TestRecord = RecordDescriptor("test/ipaddress", [
+ ("net.ipaddress", "ip"),
+ ])
+
+ record_in = TestRecord("10.22.99.255")
+ data = packer.pack(record_in)
+ record_out = packer.unpack(data)
+ assert record_in == record_out
+
+ # ip should be encoded as dword/bytes
+ assert b"\x0a\x16\x63\xff" in data
+
+
+def test_pack_ipnetwork():
+ packer = RecordPacker()
+
+ TestRecord = RecordDescriptor("test/ipnetwork", [
+ ("net.ipnetwork", "subnet"),
+ ])
+
+ record_in = TestRecord("172.16.0.0/16")
+ data = packer.pack(record_in)
+ record_out = packer.unpack(data)
+ assert record_in == record_out
+
+ # subnet should be encoded as string
+ assert b"172.16.0.0/16" in data
diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py
new file mode 100644
index 0000000..2854b45
--- /dev/null
+++ b/tests/test_fieldtypes.py
@@ -0,0 +1,458 @@
+# coding: utf-8
+
+import pytest
+import datetime
+import hashlib
+
+from flow.record import RecordDescriptor
+from flow.record.fieldtypes import net
+from flow.record.fieldtypes import uri
+from flow.record.fieldtypes import fieldtype_for_value
+import flow.record.fieldtypes
+
+INT64_MAX = (1 << 63) - 1
+INT32_MAX = (1 << 31) - 1
+INT16_MAX = (1 << 15) - 1
+
+UINT128_MAX = (1 << 128) - 1
+UINT64_MAX = (1 << 64) - 1
+UINT32_MAX = (1 << 32) - 1
+UINT16_MAX = (1 << 16) - 1
+
+
+def test_uint16():
+ desc = RecordDescriptor("test/uint16", [
+ ("uint16", "value"),
+ ])
+
+ # valid
+ desc.recordType(0x0)
+ desc.recordType(0x1)
+ desc.recordType(UINT16_MAX)
+
+ # invalid
+ with pytest.raises(ValueError):
+ desc.recordType(-1)
+
+ with pytest.raises(ValueError):
+ desc.recordType(UINT16_MAX + 1)
+
+ with pytest.raises((ValueError, OverflowError)):
+ desc.recordType(UINT128_MAX)
+
+
+def test_uint32():
+ TestRecord = RecordDescriptor("test/uint32", [
+ ("uint32", "value"),
+ ])
+
+ # valid
+ TestRecord(0x0)
+ TestRecord(0x1)
+ TestRecord(UINT16_MAX)
+ TestRecord(UINT32_MAX)
+
+ # invalid
+ with pytest.raises(ValueError):
+ TestRecord(-1)
+
+ with pytest.raises(ValueError):
+ TestRecord(UINT32_MAX + 1)
+
+ with pytest.raises((ValueError, OverflowError)):
+ TestRecord(UINT128_MAX)
+
+
+def test_net_ipv4_address():
+ TestRecord = RecordDescriptor("test/net/ipv4/address", [
+ ("net.ipv4.Address", "ip"),
+ ])
+
+ TestRecord("1.1.1.1")
+ TestRecord("0.0.0.0")
+ TestRecord("192.168.0.1")
+ TestRecord("255.255.255.255")
+
+ r = TestRecord(u"127.0.0.1")
+
+ assert isinstance(r.ip, net.ipv4.Address)
+
+ for invalid in ["1.1.1.256", "192.168.0.1/24", "a.b.c.d"]:
+ with pytest.raises(Exception) as excinfo:
+ TestRecord(invalid)
+ excinfo.match(r'.*illegal IP address string.*')
+
+ r = TestRecord()
+ assert r.ip is None
+
+
+def test_net_ipv4_subnet():
+ TestRecord = RecordDescriptor("test/net/ipv4/subnet", [
+ ("net.ipv4.Subnet", "subnet"),
+ ])
+
+ r = TestRecord("1.1.1.0/24")
+ assert str(r.subnet) == "1.1.1.0/24"
+
+ assert "1.1.1.1" in r.subnet
+ assert "1.1.1.2" in r.subnet
+
+ assert "1.1.2.1" not in r.subnet
+ # assert "1.1.1.1/32" not in r.subnet
+
+ r = TestRecord("0.0.0.0")
+ r = TestRecord("192.168.0.1")
+ r = TestRecord("255.255.255.255")
+
+ r = TestRecord(u"127.0.0.1")
+
+ for invalid in ["a.b.c.d", "foo", "bar", ""]:
+ with pytest.raises(Exception) as excinfo:
+ TestRecord(invalid)
+ excinfo.match(r'.*illegal IP address string.*')
+
+ for invalid in [1, 1.0, sum, dict(), list(), True]:
+ with pytest.raises(TypeError) as excinfo:
+ TestRecord(invalid)
+ excinfo.match(r'Subnet\(\) argument 1 must be string, not .*')
+
+ with pytest.raises(ValueError) as excinfo:
+ TestRecord("192.168.0.106/28")
+ excinfo.match(r"Not a valid subnet '192\.168\.0\.106/28', did you mean '192\.168\.0\.96/28' ?")
+
+
+def test_bytes():
+ TestRecord = RecordDescriptor("test/string", [
+ ("string", "url"),
+ ("bytes", "body"),
+ ])
+
+ r = TestRecord("url", b"some bytes")
+ assert r.body == b"some bytes"
+
+ with pytest.raises(TypeError) as excinfo:
+ r = TestRecord("url", 1234)
+ excinfo.match(r"Value not of bytes type")
+
+ with pytest.raises(TypeError) as excinfo:
+ r = TestRecord("url", u"a string")
+ excinfo.match(r"Value not of bytes type")
+
+ b_array = bytes(bytearray(range(256)))
+ body = b"HTTP/1.1 200 OK\r\n\r\n" + b_array
+ r = TestRecord("http://www.fox-it.com", body)
+ assert r
+ assert r.url == u"http://www.fox-it.com"
+ assert r.body == b"HTTP/1.1 200 OK\r\n\r\n" + b_array
+
+ # test case where the input is bytes
+ r = TestRecord("http://www.fox-it.com", b'HTTP/1.1 500 Error\r\n\r\nError')
+ assert r.body == b"HTTP/1.1 500 Error\r\n\r\nError"
+
+
+def test_string():
+ TestRecord = RecordDescriptor("test/string", [
+ ("string", "name"),
+ ])
+
+ r = TestRecord("Fox-IT")
+ assert r.name == u"Fox-IT"
+
+ r = TestRecord(u"Rémy")
+ assert r.name == u"Rémy"
+
+ # construct from 'bytes'
+ r = TestRecord(b'R\xc3\xa9my')
+ assert r.name == u"Rémy"
+
+ # construct from 'bytes' but with invalid unicode bytes
+ if isinstance(u'', str):
+ # Python 3
+ with pytest.raises(UnicodeDecodeError):
+ TestRecord(b'R\xc3\xa9\xeamy')
+ else:
+ # Python 2
+ with pytest.warns(RuntimeWarning):
+ r = TestRecord(b'R\xc3\xa9\xeamy')
+ assert r.name
+
+
+def test_wstring():
+ # Behaves the same as test/string, only available for backwards compatibility purposes
+ TestRecord = RecordDescriptor("test/wstring", [
+ ("wstring", "name"),
+ ])
+
+ r = TestRecord("Fox-IT")
+ assert r.name == u"Fox-IT"
+
+
+def test_typedlist():
+ TestRecord = RecordDescriptor("test/typedlist", [
+ ("string[]", "string_value"),
+ ("uint32[]", "uint32_value"),
+ ("uri[]", "uri_value"),
+ ])
+
+ r = TestRecord(['a', 'b', 'c'], [1, 2, 3], ["/etc/passwd", "/etc/shadow"])
+ assert len(r.string_value) == 3
+ assert len(r.uint32_value) == 3
+ assert len(r.uri_value) == 2
+ assert r.string_value[2] == 'c'
+ assert r.uint32_value[1] == 2
+ assert all([isinstance(v, uri) for v in r.uri_value])
+ assert r.uri_value[1].filename == 'shadow'
+
+ r = TestRecord()
+ assert r.string_value == []
+ assert r.uint32_value == []
+ assert r.uri_value == []
+
+ with pytest.raises(ValueError):
+ r = TestRecord(uint32_value=['a', 'b', 'c'])
+
+
+def test_stringlist():
+ TestRecord = RecordDescriptor("test/string", [
+ ("stringlist", "value"),
+ ])
+
+ r = TestRecord(['a', 'b', 'c'])
+ assert len(r.value) == 3
+ assert r.value[2] == 'c'
+
+ r = TestRecord([u"Rémy"])
+ assert r.value[0]
+
+
+def test_dictlist():
+ TestRecord = RecordDescriptor("test/dictlist", [
+ ("dictlist", "hits"),
+ ])
+
+ r = TestRecord([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+ assert len(r.hits) == 2
+ assert r.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
+ assert r.hits[0]["a"] == 1
+ assert r.hits[0]["b"] == 2
+ assert r.hits[1]["a"] == 3
+ assert r.hits[1]["b"] == 4
+
+
+def test_boolean():
+ TestRecord = RecordDescriptor("test/boolean", [
+ ("boolean", "booltrue"),
+ ("boolean", "boolfalse"),
+ ])
+
+ r = TestRecord(True, False)
+ assert bool(r.booltrue) is True
+ assert bool(r.boolfalse) is False
+
+ r = TestRecord(1, 0)
+ assert bool(r.booltrue) is True
+ assert bool(r.boolfalse) is False
+
+ assert str(r.booltrue) == "True"
+ assert str(r.boolfalse) == "False"
+
+ assert repr(r.booltrue) == "True"
+ assert repr(r.boolfalse) == "False"
+
+ with pytest.raises(ValueError):
+ r = TestRecord(2, -1)
+
+ with pytest.raises(ValueError):
+ r = TestRecord('True', 'False')
+
+
+def test_float():
+ TestRecord = RecordDescriptor("test/float", [
+ ("float", "value"),
+ ])
+
+ # initialize via float
+ r = TestRecord(1.3337)
+ assert r.value == 1.3337
+
+ # initialize via string
+ r = TestRecord("1.3337")
+ assert r.value == 1.3337
+
+ # initialize via int
+ r = TestRecord("1337")
+ assert r.value == 1337.0
+
+ # negative float
+ r = TestRecord(-12345)
+ assert r.value == -12345
+
+ # invalid float
+ with pytest.raises(ValueError):
+ r = TestRecord("abc")
+
+
+def test_uri_type():
+ TestRecord = RecordDescriptor("test/uri", [
+ ("uri", "path"),
+ ])
+
+ r = TestRecord("http://www.google.com/a.bin")
+ assert r.path.filename == "a.bin"
+ assert r.path.dirname == "/"
+ assert r.path.hostname == "www.google.com"
+ assert r.path.protocol == "http"
+ assert r.path.protocol == r.path.scheme
+ assert r.path.path == "/a.bin"
+
+ r = TestRecord("http://username:password@example.com/path/file.txt?query=1")
+ assert r.path.filename == "file.txt"
+ assert r.path.dirname == "/path"
+ assert r.path.args == "query=1"
+ assert r.path.username == "username"
+ assert r.path.password == "password"
+ assert r.path.protocol == "http"
+ assert r.path.hostname == "example.com"
+
+ r = TestRecord(uri.from_windows(r"c:\windows\program files\Fox-IT B.V\flow.exe"))
+ assert r.path.filename == "flow.exe"
+
+ r = TestRecord()
+ r.path = uri.normalize(r"c:\Users\Fox-IT\Downloads\autoruns.exe")
+ assert r.path.filename == "autoruns.exe"
+ assert r.path.dirname == uri.normalize(r"\Users\Fox-IT\Downloads")
+ assert r.path.dirname == "/Users/Fox-IT/Downloads"
+
+ r = TestRecord()
+ r.path = "/usr/local/bin/sshd"
+ assert r.path.filename == "sshd"
+ assert r.path.dirname == "/usr/local/bin"
+
+
+def test_datetime():
+ TestRecord = RecordDescriptor("test/datetime", [
+ ("datetime", "ts"),
+ ])
+
+ now = datetime.datetime.utcnow()
+ r = TestRecord(now)
+ assert r.ts == now
+
+ r = TestRecord(u"2018-03-22T15:15:23")
+ assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23)
+
+ r = TestRecord(u"2018-03-22T15:15:23.000000")
+ assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23)
+
+ r = TestRecord(u"2018-03-22T15:15:23.123456")
+ assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456)
+
+ dt = datetime.datetime(2018, 3, 22, 15, 15, 23, 123456)
+ dt_str = dt.isoformat()
+ r = TestRecord(dt_str)
+ assert r.ts == dt
+
+ r = TestRecord(1521731723)
+ assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23)
+
+
+def test_digest():
+ TestRecord = RecordDescriptor("test/digest", [
+ ("digest", "digest"),
+ ])
+
+ md5 = hashlib.md5(b"hello").hexdigest()
+ sha1 = hashlib.sha1(b"hello").hexdigest()
+ sha256 = hashlib.sha256(b"hello").hexdigest()
+
+ record = TestRecord()
+ assert isinstance(record.digest, flow.record.fieldtypes.digest)
+
+ record = TestRecord((md5, sha1, sha256))
+ assert record.digest.md5 == "5d41402abc4b2a76b9719d911017c592"
+ assert record.digest.sha1 == "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"
+ assert record.digest.sha256 == "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
+
+ record = TestRecord(("5d41402abc4b2a76b9719d911017c592", None, None))
+ assert record.digest.md5 == "5d41402abc4b2a76b9719d911017c592"
+ assert record.digest.sha1 is None
+ assert record.digest.sha256 is None
+
+ record = TestRecord()
+ record.digest = (md5, sha1, sha256)
+ assert record.digest.md5 == md5
+ assert record.digest.sha1 == sha1
+ assert record.digest.sha256 == sha256
+
+ with pytest.raises(TypeError) as excinfo:
+ record = TestRecord(("a", sha1, sha256))
+ excinfo.match(r'.*Invalid MD5.*Odd-length string')
+
+ with pytest.raises(TypeError) as excinfo:
+ record = TestRecord(("aa", sha1, sha256))
+ excinfo.match(r'.*Invalid MD5.*Incorrect hash length')
+
+ with pytest.raises(TypeError) as excinfo:
+ record = TestRecord((md5, "aa", sha256))
+ excinfo.match(r'.*Invalid SHA1.*')
+
+ with pytest.raises(TypeError) as excinfo:
+ record = TestRecord((md5, sha1, "aa"))
+ excinfo.match(r'.*Invalid SHA256.*')
+
+ record = TestRecord()
+ assert record.digest is not None
+ assert record.digest.md5 is None
+ assert record.digest.sha1 is None
+ assert record.digest.sha256 is None
+ with pytest.raises(TypeError) as excinfo:
+ record.digest.md5 = "INVALID MD5"
+ excinfo.match(r'.*Invalid MD5.*')
+
+
+def test_dynamic():
+ TestRecord = RecordDescriptor("test/dynamic", [
+ ("dynamic", "value"),
+ ])
+
+ r = TestRecord(b"bytes")
+ assert r.value == b"bytes"
+ assert isinstance(r.value, flow.record.fieldtypes.bytes)
+
+ r = TestRecord(u"string")
+ assert r.value == u"string"
+ assert isinstance(r.value, flow.record.fieldtypes.string)
+
+ r = TestRecord(123)
+ assert r.value == 123
+ assert isinstance(r.value, flow.record.fieldtypes.varint)
+
+ r = TestRecord(True)
+ assert r.value
+ assert isinstance(r.value, flow.record.fieldtypes.boolean)
+
+ r = TestRecord([1, 2, 3])
+ assert r.value == [1, 2, 3]
+ assert isinstance(r.value, flow.record.fieldtypes.stringlist)
+
+ now = datetime.datetime.utcnow()
+ r = TestRecord(now)
+ assert r.value == now
+ assert isinstance(r.value, flow.record.fieldtypes.datetime)
+
+
+def test_fieldtype_for_value():
+ assert fieldtype_for_value(True) == "boolean"
+ assert fieldtype_for_value(False) == "boolean"
+ assert fieldtype_for_value(1337) == "varint"
+ assert fieldtype_for_value(1.337) == "float"
+ assert fieldtype_for_value(b"\r\n") == "bytes"
+ assert fieldtype_for_value("hello world") == "string"
+ assert fieldtype_for_value(datetime.datetime.now()) == "datetime"
+ assert fieldtype_for_value([1, 2, 3, 4, 5]) == "string"
+ assert fieldtype_for_value([1, 2, 3, 4, 5], None) is None
+ assert fieldtype_for_value(object(), None) is None
+
+
+if __name__ == "__main__":
+ __import__("standalone_test").main(globals())
diff --git a/tests/test_json_packer.py b/tests/test_json_packer.py
new file mode 100644
index 0000000..cfce228
--- /dev/null
+++ b/tests/test_json_packer.py
@@ -0,0 +1,25 @@
+from __future__ import print_function
+from datetime import datetime
+from flow.record import JsonRecordPacker, RecordDescriptor
+
+
+def test_record_in_record():
+ packer = JsonRecordPacker()
+ dt = datetime.utcnow()
+
+ RecordA = RecordDescriptor("test/record_a", [
+ ("datetime", "some_dt"),
+ ])
+ RecordB = RecordDescriptor("test/record_b", [
+ ("record", "record"),
+ ("datetime", "some_dt"),
+ ])
+
+ record_a = RecordA(dt)
+ record_b = RecordB(record_a, dt)
+
+ data_record_b = packer.pack(record_b)
+ record_b_unpacked = packer.unpack(data_record_b)
+
+ assert record_b == record_b_unpacked
+ assert record_a == record_b_unpacked.record
diff --git a/tests/test_json_record_adapter.py b/tests/test_json_record_adapter.py
new file mode 100644
index 0000000..2b6b11a
--- /dev/null
+++ b/tests/test_json_record_adapter.py
@@ -0,0 +1,71 @@
+import json
+import datetime
+from flow.record import RecordDescriptor, RecordWriter, RecordReader
+
+
+def generate_records(count=100):
+ TestRecordEmbedded = RecordDescriptor("test/embedded_record", [
+ ("datetime", "dt"),
+ ])
+ TestRecord = RecordDescriptor("test/adapter", [
+ ("uint32", "number"),
+ ("record", "record"),
+ ])
+
+ for i in range(count):
+ embedded = TestRecordEmbedded(datetime.datetime.utcnow())
+ yield TestRecord(number=i, record=embedded)
+
+
+def test_json_adapter(tmpdir):
+ json_file = tmpdir.join("records.json")
+ record_adapter_path = "jsonfile://{}".format(json_file)
+ writer = RecordWriter(record_adapter_path)
+ nr_records = 1337
+
+ for record in generate_records(nr_records):
+ writer.write(record)
+ writer.flush()
+
+ nr_received_records = 0
+ reader = RecordReader(record_adapter_path)
+ for record in reader:
+ nr_received_records += 1
+
+ assert nr_records == nr_received_records
+
+
+def test_json_adapter_contextmanager(tmpdir):
+ json_file = tmpdir.join("records.json")
+ record_adapter_path = "jsonfile://{}".format(json_file)
+ with RecordWriter(record_adapter_path) as writer:
+ nr_records = 1337
+ for record in generate_records(nr_records):
+ writer.write(record)
+
+ nr_received_records = 0
+ with RecordReader(record_adapter_path) as reader:
+ for record in reader:
+ nr_received_records += 1
+
+ assert nr_records == nr_received_records
+
+
+def test_json_adapter_jsonlines(tmpdir):
+ json_file = tmpdir.join("data.jsonl")
+
+ items = [
+ {'some_float': 1.5, 'some_string': 'hello world', 'some_int': 1337, 'some_bool': True},
+ {'some_float': 2.7, 'some_string': 'goodbye world', 'some_int': 12345, 'some_bool': False},
+ ]
+ with open(json_file, "w") as fout:
+ for row in items:
+ fout.write(json.dumps(row) + "\n")
+
+ record_adapter_path = "jsonfile://{}".format(json_file)
+ reader = RecordReader(record_adapter_path)
+ for index, record in enumerate(reader):
+ assert record.some_float == items[index]["some_float"]
+ assert record.some_string == items[index]["some_string"]
+ assert record.some_int == items[index]["some_int"]
+ assert record.some_bool == items[index]["some_bool"]
diff --git a/tests/test_packer.py b/tests/test_packer.py
new file mode 100644
index 0000000..4c5ffb2
--- /dev/null
+++ b/tests/test_packer.py
@@ -0,0 +1,216 @@
+import datetime
+
+from flow.record import fieldtypes
+from flow.record import RecordDescriptor
+from flow.record import RecordPacker
+from flow.record.packer import RECORD_PACK_EXT_TYPE
+from flow.record.fieldtypes import uri
+
+
+def test_uri_packing():
+ packer = RecordPacker()
+
+ TestRecord = RecordDescriptor("test/uri", [
+ ("uri", "path"),
+ ])
+
+ # construct with an url
+ record = TestRecord("http://www.google.com/evil.bin")
+ data = packer.pack(record)
+ record = packer.unpack(data)
+ assert record.path == "http://www.google.com/evil.bin"
+ assert record.path.filename == "evil.bin"
+ assert record.path.dirname == "/"
+
+ # construct from uri() -> for windows=True
+ path = uri.from_windows(r"c:\Program Files\Fox-IT\flow is awesome.exe")
+ record = TestRecord(path)
+ data = packer.pack(record)
+ record = packer.unpack(data)
+ assert record.path == "c:/Program Files/Fox-IT/flow is awesome.exe"
+ assert record.path.filename == "flow is awesome.exe"
+ assert record.path.dirname == "/Program Files/Fox-IT"
+
+ # construct using uri.from_windows()
+ path = uri.from_windows(r"c:\Users\Hello World\foo.bar.exe")
+ record = TestRecord(path)
+ data = packer.pack(record)
+ record = packer.unpack(data)
+ assert record.path == "c:/Users/Hello World/foo.bar.exe"
+ assert record.path.filename == "foo.bar.exe"
+ assert record.path.dirname == "/Users/Hello World"
+
+
+def test_typedlist_packer():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/typedlist", [
+ ("string[]", "string_value"),
+ ("uint32[]", "uint32_value"),
+ ("uri[]", "uri_value"),
+ ])
+
+ r1 = TestRecord(['a', 'b', 'c'], [1, 2, 3], ["/etc/passwd", "/etc/shadow"])
+ data = packer.pack(r1)
+ r2 = packer.unpack(data)
+
+ assert len(r1.string_value) == 3
+ assert len(r1.uint32_value) == 3
+ assert len(r1.uri_value) == 2
+ assert r1.string_value[2] == 'c'
+ assert r1.uint32_value[1] == 2
+ assert all([isinstance(v, uri) for v in r1.uri_value])
+ assert r1.uri_value[1].filename == 'shadow'
+
+ assert len(r2.string_value) == 3
+ assert len(r2.uint32_value) == 3
+ assert len(r2.uri_value) == 2
+ assert r2.string_value[2] == 'c'
+ assert r2.uint32_value[1] == 2
+ assert all([isinstance(v, uri) for v in r2.uri_value])
+ assert r2.uri_value[1].filename == 'shadow'
+
+
+def test_dictlist_packer():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/dictlist", [
+ ("dictlist", "hits"),
+ ])
+
+ r1 = TestRecord([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+ data = packer.pack(r1)
+ r2 = packer.unpack(data)
+
+ assert len(r1.hits) == 2
+ assert r1.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
+ assert r1.hits[0]["a"] == 1
+ assert r1.hits[0]["b"] == 2
+ assert r1.hits[1]["a"] == 3
+ assert r1.hits[1]["b"] == 4
+
+ assert len(r2.hits) == 2
+ assert r2.hits == [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
+ assert r2.hits[0]["a"] == 1
+ assert r2.hits[0]["b"] == 2
+ assert r2.hits[1]["a"] == 3
+ assert r2.hits[1]["b"] == 4
+
+
+def test_dynamic_packer():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/dynamic", [
+ ("dynamic", "value"),
+ ])
+
+ t = TestRecord(123)
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value == 123
+ assert isinstance(r.value, fieldtypes.varint)
+
+ t = TestRecord(b"bytes")
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value == b"bytes"
+ assert isinstance(r.value, fieldtypes.bytes)
+
+ t = TestRecord(u"string")
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value == u"string"
+ assert isinstance(r.value, fieldtypes.string)
+
+ t = TestRecord(True)
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value
+ assert isinstance(r.value, fieldtypes.boolean)
+
+ t = TestRecord([1, True, b"b", u"u"])
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value == [1, True, b"b", u"u"]
+ assert isinstance(r.value, fieldtypes.stringlist)
+
+ now = datetime.datetime.utcnow()
+ t = TestRecord(now)
+ data = packer.pack(t)
+ r = packer.unpack(data)
+
+ assert r.value == now
+ assert isinstance(r.value, fieldtypes.datetime)
+
+
+def test_pack_record_desc():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/pack", [
+ ("string", "a"),
+ ])
+ ext_type = packer.pack_obj(TestRecord)
+ assert ext_type.code == RECORD_PACK_EXT_TYPE
+ assert ext_type.data == b"\x92\x02\x92\xa9test/pack\x91\x92\xa6string\xa1a"
+ desc = packer.unpack_obj(ext_type.code, ext_type.data)
+ assert desc.name == TestRecord.name
+ assert desc.fields.keys() == TestRecord.fields.keys()
+ assert desc._pack() == TestRecord._pack()
+
+
+def test_pack_digest():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/digest", [
+ ("digest", "digest"),
+ ])
+ record = TestRecord(("d41d8cd98f00b204e9800998ecf8427e", None, None))
+ data = packer.pack(record)
+ record = packer.unpack(data)
+ assert record.digest.md5 == "d41d8cd98f00b204e9800998ecf8427e"
+ assert record.digest.sha1 is None
+ assert record.digest.sha256 is None
+
+
+def test_record_in_record():
+ packer = RecordPacker()
+ dt = datetime.datetime.utcnow()
+
+ RecordA = RecordDescriptor("test/record_a", [
+ ("datetime", "some_dt"),
+ ])
+ RecordB = RecordDescriptor("test/record_b", [
+ ("record", "record"),
+ ("datetime", "some_dt"),
+ ])
+
+ record_a = RecordA(dt)
+ record_b = RecordB(record_a, dt)
+
+ data_record_b = packer.pack(record_b)
+ record_b_unpacked = packer.unpack(data_record_b)
+
+ assert record_b == record_b_unpacked
+ assert record_a == record_b_unpacked.record
+
+
+def test_record_array():
+ packer = RecordPacker()
+
+ EmbeddedRecord = RecordDescriptor("test/record_a", [
+ ("string", "some_field"),
+ ])
+ ParentRecord = RecordDescriptor("test/record_b", [
+ ("record[]", "subrecords"),
+ ])
+
+ parent = ParentRecord()
+ for i in range(3):
+ emb_record = EmbeddedRecord(
+ some_field="embedded record {}".format(i))
+ parent.subrecords.append(emb_record)
+
+ data_record_parent = packer.pack(parent)
+ parent_unpacked = packer.unpack(data_record_parent)
+
+ assert parent == parent_unpacked
diff --git a/tests/test_rdump.py b/tests/test_rdump.py
new file mode 100644
index 0000000..b941b18
--- /dev/null
+++ b/tests/test_rdump.py
@@ -0,0 +1,178 @@
+import json
+import base64
+import hashlib
+import subprocess
+
+from flow.record import RecordDescriptor
+from flow.record import RecordWriter, RecordReader
+
+
+def test_rdump_pipe(tmp_path):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "count"),
+ ("string", "foo"),
+ ])
+
+ path = tmp_path / "test.records"
+ writer = RecordWriter(path)
+
+ for i in range(10):
+ writer.write(TestRecord(count=i, foo="bar"))
+ writer.close()
+
+ # validate input
+ args = ["rdump", str(path)]
+ res = subprocess.Popen(args, stdout=subprocess.PIPE)
+ stdout, stderr = res.communicate()
+ assert len(stdout.splitlines()) == 10
+
+ # rdump test.records | wc -l
+ p1 = subprocess.Popen(["rdump", str(path)], stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE)
+ stdout, stderr = p2.communicate()
+ assert stdout.strip() == b"10"
+
+ # (binary) rdump test.records -w - | rdump -s 'r.count == 5'
+ p1 = subprocess.Popen(["rdump", str(path), "-w", "-"], stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(
+ ["rdump", "-s", "r.count == 5"], stdin=p1.stdout, stdout=subprocess.PIPE,
+ )
+ stdout, stderr = p2.communicate()
+ assert stdout.strip() in (b"", b"")
+
+ # (printer) rdump test.records | rdump -s 'r.count == 5'
+ p1 = subprocess.Popen(["rdump", str(path)], stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(
+ ["rdump", "-s", "r.count == 5"],
+ stdin=p1.stdout,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ stdout, stderr = p2.communicate()
+ assert stdout.strip() == b""
+ assert b"Unknown file format, not a RecordStream" in stderr.strip()
+
+ # rdump test.records -w - | rdump -s 'r.count in (1, 3, 9)' -w filtered.records
+ path2 = tmp_path / "filtered.records"
+ p1 = subprocess.Popen(["rdump", str(path), "-w", "-"], stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(
+ ["rdump", "-s", "r.count in (1, 3, 9)", "-w", str(path2)], stdin=p1.stdout,
+ )
+ stdout, stderr = p2.communicate()
+
+ reader = RecordReader(path2)
+ records = list(reader)
+ assert len(records) == 3
+ assert {r.count for r in records} == {1, 3, 9}
+
+
+def test_rdump_format_template(tmp_path):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "count"),
+ ("string", "foo"),
+ ])
+
+ path = tmp_path / "test.records"
+ writer = RecordWriter(path)
+
+ # generate some test records
+ for i in range(10):
+ writer.write(TestRecord(count=i, foo="bar"))
+ writer.close()
+
+ # validate output with -f
+ args = ["rdump", str(path), "-f", "TEST: {count},{foo}"]
+ print(args)
+ res = subprocess.Popen(args, stdout=subprocess.PIPE)
+ stdout, stderr = res.communicate()
+ for i, line in enumerate(stdout.decode().splitlines()):
+ assert line == "TEST: {i},bar".format(i=i)
+
+
+def test_rdump_json(tmp_path):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "count"),
+ ("string", "foo"),
+ ("bytes", "data"),
+ ("net.ipaddress", "ip"),
+ ("net.ipnetwork", "subnet"),
+ ("digest", "digest"),
+ ])
+
+ record_path = tmp_path / "test.records"
+ writer = RecordWriter(record_path)
+
+ # generate some test records
+ for i in range(10):
+ data = str(i).encode()
+ md5 = hashlib.md5(data).hexdigest()
+ sha1 = hashlib.sha1(data).hexdigest()
+ sha256 = hashlib.sha256(data).hexdigest()
+ writer.write(
+ TestRecord(
+ count=i,
+ foo="bar" * i,
+ data=b"\x00\x01\x02\x03--" + data,
+ ip=u"172.16.0.{}".format(i),
+ subnet=u"192.168.{}.0/24".format(i),
+ digest=(md5, sha1, sha256),
+ ))
+ writer.close()
+
+ # dump records as JSON lines
+ args = ["rdump", str(record_path), "--jsonlines"]
+ process = subprocess.Popen(args, stdout=subprocess.PIPE)
+ stdout, stderr = process.communicate()
+
+ assert process.returncode == 0
+
+ # Basic validations in stdout
+ for i in range(10):
+ assert base64.b64encode("\x00\x01\x02\x03--{}".format(i).encode()) in stdout
+ assert u"192.168.{}.0/24".format(i).encode() in stdout
+ assert u"172.16.0.{}".format(i).encode() in stdout
+ assert ("bar" * i).encode() in stdout
+
+ # Load json using json.loads() and validate key values
+ for i, line in enumerate(stdout.splitlines()):
+ json_dict = json.loads(line)
+ assert json_dict
+ if i == 0:
+ assert "_type" in json_dict
+ assert json_dict["_type"] == "recorddescriptor"
+ else:
+ count = i - 1 # fix offset as first line is the recorddescriptor information
+ data = str(count).encode()
+ md5 = hashlib.md5(data).hexdigest()
+ sha1 = hashlib.sha1(data).hexdigest()
+ sha256 = hashlib.sha256(data).hexdigest()
+ assert json_dict["count"] == count
+ assert json_dict["foo"] == "bar" * count
+ assert json_dict["data"] == base64.b64encode("\x00\x01\x02\x03--{}".format(count).encode()).decode()
+ assert json_dict["ip"] == u"172.16.0.{}".format(count)
+ assert json_dict["subnet"] == u"192.168.{}.0/24".format(count)
+ assert json_dict["digest"]["md5"] == md5
+ assert json_dict["digest"]["sha1"] == sha1
+ assert json_dict["digest"]["sha256"] == sha256
+
+ # Write jsonlines to file
+ path = tmp_path / "records.jsonl"
+ path.write_bytes(stdout)
+ json_path = "jsonfile://{}".format(path)
+
+ # Read records from json and original records file and validate
+ for path in (json_path, record_path):
+ with RecordReader(path) as reader:
+ for i, record in enumerate(reader):
+ data = str(i).encode()
+ md5 = hashlib.md5(data).hexdigest()
+ sha1 = hashlib.sha1(data).hexdigest()
+ sha256 = hashlib.sha256(data).hexdigest()
+ assert record.count == i
+ assert record.ip == u"172.16.0.{}".format(i)
+ assert record.subnet == u"192.168.{}.0/24".format(i)
+ assert record.data == b"\x00\x01\x02\x03--" + data
+ assert record.digest.md5 == md5
+ assert record.digest.sha1 == sha1
+ assert record.digest.sha256 == sha256
+ assert record.foo == "bar" * i
diff --git a/tests/test_record.py b/tests/test_record.py
new file mode 100644
index 0000000..d22a100
--- /dev/null
+++ b/tests/test_record.py
@@ -0,0 +1,613 @@
+import sys
+import datetime
+import pytest
+from flow.record import RECORD_VERSION
+from flow.record import RecordDescriptor, RecordDescriptorError
+from flow.record import RecordPacker
+from flow.record import RecordWriter, RecordReader, RecordPrinter, RecordArchiver
+from flow.record import Record, GroupedRecord
+from flow.record import record_stream, extend_record
+from flow.record import fieldtypes
+from flow.record.stream import RecordFieldRewriter
+
+from . import utils_inspect as inspect
+
+
+def test_record_creation():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ])
+
+ # No arguments defaults to None
+ r = TestRecord()
+ assert r.url is None
+ assert r.query is None
+
+ # Keyword arguments
+ r = TestRecord(url="foo", query="bar")
+ assert r.url == "foo"
+ assert r.query == "bar"
+
+ # Positional arguments
+ r = TestRecord("foo", "bar")
+ assert r.url == "foo"
+ assert r.query == "bar"
+
+ # Single keyword argument
+ r = TestRecord(query="foo")
+ assert r.query == "foo"
+ assert r.url is None
+
+
+def test_record_version(tmpdir):
+ path = "jsonfile://{}".format(tmpdir.join("test.jsonl").strpath)
+ writer = RecordWriter(path)
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "hello"),
+ ("string", "world"),
+ ])
+
+ r1 = TestRecord(hello="hello", world="world")
+ writer.write(r1)
+ data = packer.pack(r1)
+ u1 = packer.unpack(data)
+ print(repr(u1._desc))
+
+ assert u1.hello == r1.hello
+ assert u1.world == r1.world
+
+ # change the order
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "world"),
+ ("string", "hello"),
+ ])
+ r2 = TestRecord(hello="hello", world="world")
+ writer.write(r2)
+ data = packer.pack(r2)
+ u2 = packer.unpack(data)
+
+ assert u2.hello == r2.hello
+ assert u2.world == r2.world
+ print(repr(u2._desc))
+
+ # change fieldtypes
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "world"),
+ ("string", "hello"),
+ ])
+ r3 = TestRecord(hello="hello", world=42)
+ writer.write(r3)
+ data = packer.pack(r3)
+ u3 = packer.unpack(data)
+
+ writer.flush()
+
+ assert u3._desc.identifier == r3._desc.identifier
+ assert u1._desc.identifier != u3._desc.identifier
+ assert u2._desc.identifier != u3._desc.identifier
+ assert u3.hello == r3.hello
+ assert u3.world == r3.world
+
+ reader = RecordReader(path)
+ rec = [r for r in reader]
+ assert len(rec) == 3
+ assert u3._desc.identifier == rec[2]._desc.identifier
+ assert u1._desc.identifier != rec[2]._desc.identifier
+ assert u2._desc.identifier != rec[2]._desc.identifier
+ assert u3.hello == rec[2].hello
+ assert u3.world == rec[2].world
+
+
+def test_grouped_record():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "hello"),
+ ("string", "world"),
+ ("uint32", "count"),
+ ])
+ WQMetaRecord = RecordDescriptor("wq/meta", [
+ ("string", "assignee"),
+ ("string", "profile"),
+ ("string", "hello"),
+ ])
+
+ test_record = TestRecord("a", "b", 12345)
+ meta_record = WQMetaRecord("me", "this is a test", "other hello")
+
+ grouped = GroupedRecord("grouped/wq", [test_record, meta_record])
+ assert grouped.hello == "a"
+ assert grouped.world == "b"
+ assert grouped.count == 12345
+ assert grouped.assignee == "me"
+ assert grouped.profile == "this is a test"
+
+ grouped.profile = "omg"
+ grouped.hello = "new value"
+ assert grouped.hello == "new value"
+ assert grouped.profile == "omg"
+ assert grouped.records[0].hello == "new value"
+ assert grouped.records[1].hello == "other hello"
+
+ grouped.records[1].hello = "testing"
+ assert grouped.hello != "testing"
+ assert grouped.hello == "new value"
+ assert grouped.records[1].hello == "testing"
+
+ assert len(grouped.records) == 2
+
+ # test grouped._asdict
+ rdict = grouped._asdict()
+ assert set(["hello", "world", "count", "assignee", "profile", "hello"]) <= set(rdict)
+
+ rdict = grouped._asdict(fields=["profile", "count", "_generated"])
+ assert set(["profile", "count", "_generated"]) == set(rdict)
+ assert rdict["profile"] == "omg"
+ assert rdict["count"] == 12345
+
+
+def test_grouped_records_packing(tmpdir):
+ RecordA = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+ RecordB = RecordDescriptor("test/b", [
+ ("string", "b_string"),
+ ("string", "common"),
+ ("uint32", "b_count"),
+ ])
+ a = RecordA("hello", "world", 12345, _source="TheBadInternet", _classification="CLASSIFIED")
+ b = RecordB("good", "bye", 54321, _source="TheGoodInternet", _classification="TLP.WHITE")
+ assert isinstance(a, Record)
+ assert not isinstance(a, GroupedRecord)
+
+ grouped = GroupedRecord("grouped/ab", [a, b])
+ assert isinstance(grouped, (Record, GroupedRecord))
+ assert [(f.typename, f.name) for f in grouped._desc.fields.values()] == [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ("string", "b_string"),
+ ("uint32", "b_count"),
+ ]
+
+ path = tmpdir.join("grouped.records").strpath
+ writer = RecordWriter(path)
+ writer.write(grouped)
+ writer.write(grouped)
+ writer.write(grouped)
+ writer.write(grouped)
+ writer.write(grouped)
+ writer.flush()
+
+ reader = RecordReader(path)
+ record = next(iter(reader))
+
+ # grouped record tests
+ assert isinstance(record, Record)
+ assert isinstance(record, GroupedRecord)
+ assert record.common == "world" # first 'key' has precendence
+ assert record.name == "grouped/ab"
+ assert record.a_string == "hello"
+ assert record.a_count == 12345
+ assert record.b_count == 54321
+ assert record.b_string == "good"
+ assert record._source == "TheBadInternet"
+ assert record._classification == "CLASSIFIED"
+
+ # access 'common' on second record directly
+ assert record.records[1].common == "bye"
+
+ # access raw records directly
+ assert len(record.records) == 2
+ assert record.records[0]._desc.name == "test/a"
+ assert record.records[1]._desc.name == "test/b"
+
+ # test using selectors
+ reader = RecordReader(path, selector="r.a_count == 12345")
+ assert len(list(iter(reader))) == 5
+
+ reader = RecordReader(path, selector="r.common == 'bye'")
+ assert len(list(iter(reader))) == 0
+ reader = RecordReader(path, selector="r.common == 'world'")
+ assert len(list(iter(reader))) == 5
+
+
+def test_record_reserved_fieldname():
+ with pytest.raises(RecordDescriptorError):
+ RecordDescriptor("test/a", [
+ ("string", "_classification"),
+ ("string", "_source"),
+ ("uint32", "_generated"),
+ ])
+
+
+def test_record_printer_stdout(capsys):
+ Record = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+ record = Record("hello", "world", 10)
+
+ # fake capsys to be a tty.
+ def isatty():
+ return True
+ capsys._capture.out.tmpfile.isatty = isatty
+
+ writer = RecordPrinter(getattr(sys.stdout, "buffer", sys.stdout))
+ writer.write(record)
+
+ out, err = capsys.readouterr()
+ modifier = '' if isinstance(u'', str) else 'u'
+ expected = "\n".format(u=modifier)
+ assert out == expected
+
+
+def test_record_field_limit():
+ count = 1337
+ fields = [('uint32', 'field_{}'.format(i)) for i in range(count)]
+ values = dict([('field_{}'.format(i), i) for i in range(count)])
+
+ Record = RecordDescriptor("test/limit", fields)
+ record = Record(**values)
+
+ for i in range(count):
+ assert getattr(record, 'field_{}'.format(i)) == i
+
+ # test kwarg init
+ record = Record(field_404=12345)
+ assert record.field_404 == 12345
+ assert record.field_0 is None
+
+ # test arg init
+ record = Record(200, 302, 404)
+ assert record.field_0 == 200
+ assert record.field_1 == 302
+ assert record.field_2 == 404
+ assert record.field_404 is None
+
+ # test arg + kwarg init
+ record = Record(200, 302, 404, field_502=502)
+ assert record.field_0 == 200
+ assert record.field_1 == 302
+ assert record.field_2 == 404
+ assert record.field_3 is None
+ assert record.field_502 == 502
+
+
+def test_record_internal_version():
+ Record = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+
+ record = Record("hello", "world", 10)
+ assert record._version == RECORD_VERSION
+
+ record = Record("hello", "world", 10, _version=1337)
+ assert record._version == RECORD_VERSION
+
+
+def test_record_reserved_keyword():
+ Record = RecordDescriptor("test/a", [
+ ("string", "from"),
+ ("string", "and"),
+ ("uint32", "or"),
+ ("uint32", "normal"),
+ ])
+
+ init = Record.recordType.__init__
+ sig = inspect.signature(init)
+ params = list(sig.parameters.values())
+ assert init.__code__.co_argcount == 1
+ assert len(params) == 3
+ assert params[1].name == 'args'
+ assert params[1].kind == params[1].VAR_POSITIONAL
+ assert params[2].name == 'kwargs'
+ assert params[2].kind == params[2].VAR_KEYWORD
+
+ r = Record('hello', 'world', 1337, 10)
+ assert getattr(r, 'from') == 'hello'
+ assert getattr(r, 'and') == 'world'
+ assert getattr(r, 'or') == 1337
+ assert r.normal == 10
+
+ r = Record('some', 'missing', normal=5)
+ assert getattr(r, 'from') == 'some'
+ assert getattr(r, 'and') == 'missing'
+ assert getattr(r, 'or') is None
+ assert r.normal == 5
+
+ r = Record('from_value', **{'and': 'dict', 'or': 7331, 'normal': 3})
+ assert getattr(r, 'from') == 'from_value'
+ assert getattr(r, 'and') == 'dict'
+ assert getattr(r, 'or') == 7331
+ assert r.normal == 3
+
+ Record = RecordDescriptor("test/a", [
+ ("uint32", "normal"),
+ ])
+
+ init = Record.recordType.__init__
+ sig = inspect.signature(init)
+ params = list(sig.parameters.values())
+ assert init.__code__.co_argcount == 6
+ assert len(params) == 6
+ assert params[1].name == 'normal'
+ assert params[1].kind == params[1].POSITIONAL_OR_KEYWORD
+ assert params[1].default is None
+ assert params[2].name == '_source'
+ assert params[2].kind == params[2].POSITIONAL_OR_KEYWORD
+ assert params[2].default is None
+ assert params[3].name == '_classification'
+ assert params[3].kind == params[3].POSITIONAL_OR_KEYWORD
+ assert params[3].default is None
+ assert params[4].name == '_generated'
+ assert params[4].kind == params[4].POSITIONAL_OR_KEYWORD
+ assert params[4].default is None
+ assert params[5].name == '_version'
+ assert params[5].kind == params[5].POSITIONAL_OR_KEYWORD
+ assert params[5].default is None
+
+ Record = RecordDescriptor("test/a", [
+ ("uint32", "self"),
+ ("uint32", "cls"),
+ ])
+ r = Record(1, 2)
+ assert r.self == 1
+ assert r.cls == 2
+
+
+def test_record_stream(tmp_path):
+ Record = RecordDescriptor("test/counter", [
+ ("uint32", "counter"),
+ ("string", "tag"),
+ ])
+
+ datasets = [
+ tmp_path / "dataset1.records",
+ tmp_path / "dataset2.records.gz",
+ ]
+
+ for ds in datasets:
+ writer = RecordWriter(str(ds))
+ for i in range(100):
+ writer.write(Record(i, tag=ds.name))
+ writer.close()
+
+ datasets = [str(ds) for ds in datasets]
+ assert len(list(record_stream(datasets))) == len(datasets) * 100
+ assert len(list(record_stream(datasets, "r.counter == 42"))) == len(datasets)
+
+
+def test_record_replace():
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "index"),
+ ("string", "foo"),
+ ])
+
+ t = TestRecord(1, "hello")
+ assert t.index == 1
+ assert t.foo == "hello"
+
+ t2 = t._replace(foo="bar", index=1337)
+ assert t2.foo == "bar"
+ assert t2.index == 1337
+
+ t3 = t._replace()
+ assert t3.index == 1
+ assert t3.foo == "hello"
+ assert t3._source == t._source
+ assert t3._generated == t._generated
+ assert t3._version == t._version
+
+ t4 = t2._replace(foo="test", _source="pytest")
+ assert t4.index == 1337
+ assert t4.foo == "test"
+ assert t4._source == "pytest"
+ assert t4._generated == t2._generated
+
+ with pytest.raises(ValueError) as excinfo:
+ t._replace(foobar="keyword does not exist")
+ excinfo.match(".*Got unexpected field names:.*foobar.*")
+
+
+def test_record_init_from_record():
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "index"),
+ ("string", "foo"),
+ ])
+
+ t = TestRecord(1, "hello")
+ assert t.index == 1
+ assert t.foo == "hello"
+
+ TestRecord2 = TestRecord.extend([
+ ("string", "bar"),
+ ("uint32", "test"),
+ ])
+ t2 = TestRecord2.init_from_record(t)
+ assert t2.index == 1
+ assert t2.foo == "hello"
+ assert t2.bar is None
+ assert t2.test is None
+
+ t2.bar = "bar"
+ t2.test = 3
+ assert t2.bar == "bar"
+ assert t2.test == 3
+
+ TestRecord3 = RecordDescriptor("test/record3", [
+ ("string", "test"),
+ ("uint32", "count"),
+ ])
+ with pytest.raises(TypeError):
+ t3 = TestRecord3.init_from_record(t2, raise_unknown=True)
+
+ # explicit raise_unknown=False
+ t3 = TestRecord3.init_from_record(t2, raise_unknown=False)
+ assert t3.test == "3"
+ assert t3.count is None
+
+ # default should not raise either
+ t3 = TestRecord3.init_from_record(t2)
+ assert t3.test == "3"
+ assert t3.count is None
+
+
+def test_record_asdict():
+ Record = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+ record = Record("hello", "world", 1337)
+ rdict = record._asdict()
+ assert rdict.get("a_string") == "hello"
+ assert rdict.get("common") == "world"
+ assert rdict.get("a_count") == 1337
+ assert set(rdict) == set(["a_string", "common", "a_count", "_source", "_generated", "_version", "_classification"])
+
+ rdict = record._asdict(fields=["common", "_source", "a_string"])
+ assert set(rdict) == set(["a_string", "common", "_source"])
+
+ rdict = record._asdict(exclude=["a_count", "_source", "_generated", "_version"])
+ assert set(rdict) == set(["a_string", "common", "_classification"])
+
+ rdict = record._asdict(fields=["common", "_source", "a_string"], exclude=["common"])
+ assert set(rdict) == set(["a_string", "_source"])
+
+
+def test_recordfield_rewriter_expression():
+ rewriter = RecordFieldRewriter(expression="upper_a = a_string.upper(); count_times_10 = a_count * 10")
+ Record = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+ record = Record("hello", "world", 1337)
+ new_record = rewriter.rewrite(record)
+ assert new_record.a_string == "hello"
+ assert new_record.common == "world"
+ assert new_record.a_count == 1337
+ assert new_record.upper_a == "HELLO"
+ assert new_record.count_times_10 == 1337 * 10
+
+
+def test_recordfield_rewriter_fields():
+ rewriter = RecordFieldRewriter(fields=["a_count"])
+ Record = RecordDescriptor("test/a", [
+ ("string", "a_string"),
+ ("string", "common"),
+ ("uint32", "a_count"),
+ ])
+ record = Record("hello", "world", 1337)
+ new_record = rewriter.rewrite(record)
+ assert hasattr(new_record, "a_count")
+ assert not hasattr(new_record, "a_string")
+ assert not hasattr(new_record, "common")
+
+
+def test_extend_record():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ])
+ FooRecord = RecordDescriptor("test/foo", [
+ ("varint", "foo"),
+ ("bytes", "query"),
+ ("bytes", "bar"),
+ ])
+ HelloRecord = RecordDescriptor("test/hello", [
+ ("string", "hello"),
+ ("string", "world"),
+ ("string", "url"),
+ ])
+
+ a = TestRecord("http://flow.record", "myquery")
+ b = FooRecord(12345, b"FOO", b"BAR")
+ c = HelloRecord("hello", "world", "http://hello.world")
+
+ new = extend_record(a, [b, c])
+ assert new._desc == RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ("varint", "foo"),
+ ("bytes", "bar"),
+ ("string", "hello"),
+ ("string", "world"),
+ ])
+ assert new.url == "http://flow.record"
+ assert new.query == "myquery"
+ assert new.foo == 12345
+ assert new.bar == b"BAR"
+ assert new.hello == "hello"
+ assert new.world == "world"
+
+ new = extend_record(a, [b, c], replace=True)
+ assert new._desc == RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("bytes", "query"),
+ ("varint", "foo"),
+ ("bytes", "bar"),
+ ("string", "hello"),
+ ("string", "world"),
+ ])
+ assert new.url == "http://hello.world"
+ assert new.query == b"FOO"
+ assert new.foo == 12345
+ assert new.bar == b"BAR"
+ assert new.hello == "hello"
+ assert new.world == "world"
+
+
+def test_extend_record_with_replace():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "ip"),
+ ("uint16", "port"),
+ ("string", "data"),
+ ("string", "note"),
+ ])
+ ReplaceRecord = RecordDescriptor("test/foo", [
+ ("net.ipaddress", "ip"),
+ ("net.tcp.Port", "port"),
+ ("bytes", "data"),
+ ("string", "location"),
+ ])
+
+ a = TestRecord("10.13.13.17", 80, "HTTP/1.1 200 OK\r\n", "webserver")
+ b = ReplaceRecord(
+ ip=a.ip,
+ port=a.port,
+ data=a.data.encode(),
+ location="DMZ",
+ )
+ new = extend_record(a, [b], replace=False)
+ assert new.ip == "10.13.13.17"
+ assert new.port == 80
+ assert new.data == "HTTP/1.1 200 OK\r\n"
+ assert new.note == "webserver"
+ assert new.location == "DMZ"
+ assert isinstance(new.ip, str)
+ assert isinstance(new.port, int)
+ assert isinstance(new.data, str)
+ assert isinstance(new.note, str)
+ assert isinstance(new.location, str)
+ assert new._desc.name == "test/record"
+ assert " len(before)
+ assert len(before) == 3
+ assert len(after) == 6
+
+
+def test_record_archiver(tmpdir):
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "id"),
+ ])
+
+ records = [
+ TestRecord(id=1, _generated=datetime.datetime(2017, 12, 6, 22, 10)),
+ TestRecord(id=2, _generated=datetime.datetime(2017, 12, 6, 23, 59)),
+ TestRecord(id=3, _generated=datetime.datetime(2017, 12, 7, 00, 00)),
+ ]
+
+ p = tmpdir.mkdir("test")
+
+ writer = RecordArchiver(p, name="archive-test")
+ for rec in records:
+ writer.write(rec)
+ writer.close()
+
+ assert p.join("2017/12/06").check(dir=1)
+ assert p.join("2017/12/07").check(dir=1)
+
+ assert p.join("2017/12/06/archive-test-20171206T22.records.gz").check(file=1)
+ assert p.join("2017/12/06/archive-test-20171206T23.records.gz").check(file=1)
+ assert p.join("2017/12/07/archive-test-20171207T00.records.gz").check(file=1)
+
+ # test archiving
+ before = p.join("2017/12/06").listdir()
+ writer = RecordArchiver(p, name="archive-test")
+ for rec in records:
+ writer.write(rec)
+ writer.close()
+ after = p.join("2017/12/06").listdir()
+
+ assert set(before).issubset(set(after))
+ assert len(after) > len(before)
+ assert len(before) == 2
+ assert len(after) == 4
+
+
+def test_record_writer_stdout():
+ writer = RecordWriter()
+ assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout)
+
+ writer = RecordWriter(None)
+ assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout)
+
+ writer = RecordWriter("")
+ assert writer.fp == getattr(sys.stdout, "buffer", sys.stdout)
+
+ # We cannot test RecordReader() because it will read from stdin during init
+ # reader = RecordReader()
+ # assert reader.fp == sys.stdin
+
+
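+# NOTE: helper assumed by the adapter/pathlib/header tests below; it is not visible in
+# this hunk as shown and is mirrored from generate_records() in tests/test_json_record_adapter.py.
+def generate_records(count=100):
+ TestRecordEmbedded = RecordDescriptor("test/embedded_record", [
+ ("datetime", "dt"),
+ ])
+ TestRecord = RecordDescriptor("test/adapter", [
+ ("uint32", "number"),
+ ("record", "record"),
+ ])
+
+ for i in range(count):
+ embedded = TestRecordEmbedded(datetime.datetime.utcnow())
+ yield TestRecord(number=i, record=embedded)
+
+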
+def test_record_adapter_archive(tmpdir):
+ # archive some records, using "testing" as name
+ writer = RecordWriter("archive://{}?name=testing".format(tmpdir))
+ dt = datetime.datetime.utcnow()
+ count = 0
+ for rec in generate_records():
+ writer.write(rec)
+ count += 1
+ writer.close()
+
+ # defaults to always archive by /YEAR/MONTH/DAY/ dir structure
+ outdir = tmpdir.join("{ts:%Y/%m/%d}".format(ts=dt))
+ assert len(outdir.listdir())
+
+ # read the archived records and test filename and counts
+ count2 = 0
+ for fname in outdir.listdir():
+ assert fname.basename.startswith("testing-")
+ for rec in RecordReader(str(fname)):
+ count2 += 1
+ assert count == count2
+
+
+def test_record_pathlib(tmp_path):
+ # Test support for Pathlib/PathLike objects
+ writer = RecordWriter(tmp_path / "test.records")
+ for rec in generate_records(100):
+ writer.write(rec)
+ writer.close()
+
+ reader = RecordReader(tmp_path / "test.records")
+ assert len([rec for rec in reader]) == 100
+ assert not isinstance(tmp_path / "test.records", str)
+
+
+def test_record_pathlib_contextmanager(tmp_path):
+ with RecordWriter(tmp_path / "test.records") as writer:
+ for rec in generate_records(100):
+ writer.write(rec)
+
+ with RecordReader(tmp_path / "test.records") as reader:
+ assert len([rec for rec in reader]) == 100
+ assert not isinstance(tmp_path / "test.records", str)
+
+
+def test_record_pathlib_contextmanager_double_close(tmp_path):
+ with RecordWriter(tmp_path / "test.records") as writer:
+ for rec in generate_records(100):
+ writer.write(rec)
+ writer.close()
+
+ with RecordReader(tmp_path / "test.records") as reader:
+ assert len([rec for rec in reader]) == 100
+ reader.close()
+
+
+def test_record_invalid_recordstream(tmp_path):
+ path = str(tmp_path / "invalid_records")
+ with open(path, "wb") as f:
+ f.write(b"INVALID RECORD STREAM FILE")
+
+ with pytest.raises(IOError):
+ with RecordReader(path) as reader:
+ for r in reader:
+ assert(r)
+
+
+@pytest.mark.parametrize("adapter,contains", [
+ ("csvfile", (b"5,hello,world", b"count,foo,bar,")),
+ ("jsonfile", (b'"count": 5', )),
+ ("text", (b"count=5", )),
+ ("line", (b"count = 5", b"--[ RECORD 5 ]--")),
+])
+def test_record_adapter(adapter, contains, tmp_path):
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "count"),
+ ("string", "foo"),
+ ("string", "bar"),
+ ])
+
+ # construct the RecordWriter with uri
+ path = tmp_path / "output"
+ uri = "{adapter}://{path!s}".format(adapter=adapter, path=path)
+
+ # test parametrized contains
+ with RecordWriter(uri) as writer:
+ for i in range(10):
+ rec = TestRecord(count=i, foo="hello", bar="world")
+ writer.write(rec)
+ for pattern in contains:
+ assert pattern in path.read_bytes()
+
+ # test the 'fields' include filter (only the listed fields are written)
+ with RecordWriter("{}?fields=count".format(uri)) as writer:
+ for i in range(10):
+ rec = TestRecord(count=i, foo="hello", bar="world")
+ writer.write(rec)
+
+ # test exclude
+ with RecordWriter("{}?exclude=count".format(uri)) as writer:
+ for i in range(10):
+ rec = TestRecord(count=i, foo="hello", bar="world")
+ writer.write(rec)
+
+
+def test_text_record_adapter(capsys):
+ TestRecordWithFooBar = RecordDescriptor("test/record", [
+ ("string", "name"),
+ ("string", "foo"),
+ ("string", "bar"),
+ ])
+ TestRecordWithoutFooBar = RecordDescriptor("test/record2", [
+ ("string", "name"),
+ ])
+ format_spec = "Hello {name}, {foo} is {bar}!"
+ with RecordWriter(f"text://?format_spec={format_spec}") as writer:
+ # Format string with existing variables
+ rec = TestRecordWithFooBar(name="world", foo="foo", bar="bar")
+ writer.write(rec)
+ out, err = capsys.readouterr()
+ assert "Hello world, foo is bar!\n" == out
+
+ # Format string with non-existing variables
+ rec = TestRecordWithoutFooBar(name="planet")
+ writer.write(rec)
+ out, err = capsys.readouterr()
+ assert "Hello planet, {foo} is {bar}!\n" == out
+
+
+def test_recordstream_header(tmp_path):
+ # Create and delete a RecordWriter, with nothing happening
+ p = tmp_path / "out.records"
+ writer = RecordWriter(p)
+ del(writer)
+ assert p.read_bytes() == b""
+
+ # RecordWriter via context manager, always flushes and closes afterwards
+ p = tmp_path / "out2.records"
+ with RecordWriter(p) as writer:
+ pass
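+ # The stream header is a 4-byte big-endian length prefix (0x0000000f = 15) followed by
+ # the msgpack bin8 encoding (\xc4\x0d) of the 13-byte marker b"RECORDSTREAM\n".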
+ assert p.read_bytes() == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n"
+
+ # Manual RecordWriter with no records and close (no flush)
+ p = tmp_path / "out3.records"
+ writer = RecordWriter(p)
+ writer.close()
+ assert p.read_bytes() == b""
+
+ # Manual RecordWriter with no records but flush and close
+ p = tmp_path / "out3.records"
+ writer = RecordWriter(p)
+ writer.flush()
+ writer.close()
+ assert p.read_bytes() == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n"
+
+ # Manual RecordWriter with some records written; flush to ensure buffered output is written
+ p = tmp_path / "out4.records"
+ writer = RecordWriter(p)
+ writer.write(next(generate_records()))
+ writer.flush()
+ del(writer)
+ assert p.read_bytes().startswith(b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n")
+
+
+def test_recordstream_header_stdout(capsysbinary):
+ with RecordWriter() as writer:
+ pass
+ out, err = capsysbinary.readouterr()
+ assert out == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n"
+
+ writer = RecordWriter()
+ del(writer)
+ out, err = capsysbinary.readouterr()
+ assert out == b""
+
+ writer = RecordWriter()
+ writer.close()
+ out, err = capsysbinary.readouterr()
+ assert out == b""
+
+ writer = RecordWriter()
+ writer.flush()
+ writer.close()
+ out, err = capsysbinary.readouterr()
+ assert out == b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n"
diff --git a/tests/test_record_descriptor.py b/tests/test_record_descriptor.py
new file mode 100644
index 0000000..e9fde2b
--- /dev/null
+++ b/tests/test_record_descriptor.py
@@ -0,0 +1,142 @@
+import struct
+import hashlib
+
+from flow.record import RecordDescriptor
+from flow.record import RecordField
+
+
+def test_record_descriptor():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ("varint", "status"),
+ ])
+
+ # Get fields of type string
+ fields = TestRecord.getfields("string")
+ assert isinstance(fields, list)
+ assert len(fields) == 2
+ assert isinstance(fields[0], RecordField)
+ assert fields[0].typename == "string"
+ assert fields[0].name == "url"
+
+ # Get fields as tuples
+ fields = TestRecord.get_field_tuples()
+ assert isinstance(fields, tuple)
+ assert len(fields) == 3
+ assert isinstance(fields[0], tuple)
+ assert fields[0][0] == "string"
+ assert fields[0][1] == "url"
+
+
+def test_record_descriptor_clone():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ("varint", "status"),
+ ])
+
+ # Clone record descriptor
+ OtherRecord = RecordDescriptor("other/record", TestRecord)
+
+ assert TestRecord.name == "test/record"
+ assert OtherRecord.name == "other/record"
+ assert TestRecord.descriptor_hash != OtherRecord.descriptor_hash
+ assert TestRecord.get_field_tuples() == OtherRecord.get_field_tuples()
+
+
+def test_record_descriptor_extend():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ])
+
+ # Add field
+ ExtendedRecord = TestRecord.extend([("varint", "status")])
+
+ assert TestRecord.name == "test/record"
+ assert ExtendedRecord.name == "test/record"
+ assert TestRecord.descriptor_hash != ExtendedRecord.descriptor_hash
+ assert len(TestRecord.get_field_tuples()) == 2
+ assert len(ExtendedRecord.get_field_tuples()) == 3
+
+
+def test_record_descriptor_hash_cache():
+ # Get initial cache stats
+ TestRecord1 = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ])
+ info = RecordDescriptor.calc_descriptor_hash.cache_info()
+
+ # Create same descriptor, check cache hit increase
+ TestRecord2 = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ])
+ info2 = RecordDescriptor.calc_descriptor_hash.cache_info()
+ assert info2.hits == info.hits + 1
+ assert info.misses == info2.misses
+ assert TestRecord1.descriptor_hash == TestRecord2.descriptor_hash
+
+ # Create different descriptor, check for cache miss increase
+ TestRecord3 = RecordDescriptor("test/record", [
+ ("string", "url"),
+ ("string", "query"),
+ ("boolean", "test"),
+ ])
+ info3 = RecordDescriptor.calc_descriptor_hash.cache_info()
+ assert info2.hits == info.hits + 1
+ assert info3.misses == info.misses + 1
+ assert TestRecord2.descriptor_hash != TestRecord3.descriptor_hash
+
+
+def test_record_descriptor_hashing():
+ """ Test if hashing is still consistent to keep compatibility """
+ TestRecord = RecordDescriptor("test/hash", [
+ ("boolean", "one"),
+ ("string", "two"),
+ ])
+
+ # known good values from flow.record version 1.4.1
+ desc_hash = 1395243447
+ desc_bytes = b"test/hashonebooleantwostring"
+
+ # calculate
+ hash_digest = struct.unpack(">L", hashlib.sha256(desc_bytes).digest()[:4])[0]
+ assert desc_hash == hash_digest
+
+ # verify current implementation
+ assert TestRecord.descriptor_hash == hash_digest
+
+
+def test_record_descriptor_hash_eq():
+ """ Tests __hash__() on RecordDescriptor """
+ TestRecordSame1 = RecordDescriptor("test/same", [
+ ("boolean", "one"),
+ ("string", "two"),
+ ])
+
+ TestRecordSame2 = RecordDescriptor("test/same", [
+ ("boolean", "one"),
+ ("string", "two"),
+ ])
+
+ TestRecordDifferentName = RecordDescriptor("test/different", [
+ ("boolean", "one"),
+ ("string", "two"),
+ ])
+
+ TestRecordDifferentFields = RecordDescriptor("test/different", [
+ ("varint", "one"),
+ ("float", "two"),
+ ])
+
+ # __hash__
+ assert hash(TestRecordSame1) == hash(TestRecordSame2)
+ assert hash(TestRecordSame1) != hash(TestRecordDifferentName)
+
+ # __eq__
+ assert TestRecordSame1 == TestRecordSame2
+ assert TestRecordSame1 != TestRecordDifferentName
+ assert TestRecordDifferentName != TestRecordDifferentFields
diff --git a/tests/test_regression.py b/tests/test_regression.py
new file mode 100644
index 0000000..d1c9ea4
--- /dev/null
+++ b/tests/test_regression.py
@@ -0,0 +1,376 @@
+import pytest
+import codecs
+import os
+import datetime
+import sys
+
+import msgpack
+
+from flow.record import (
+ base,
+ whitelist,
+ fieldtypes,
+ Record,
+ GroupedRecord,
+ RecordDescriptor,
+ RecordPacker,
+ RECORD_VERSION,
+ RecordReader,
+ RecordWriter,
+)
+from flow.record.base import is_valid_field_name
+from flow.record.packer import RECORD_PACK_EXT_TYPE, RECORD_PACK_TYPE_RECORD
+from flow.record.selector import Selector, CompiledSelector
+
+
+def test_datetime_serialization():
+ packer = RecordPacker()
+
+ now = datetime.datetime.utcnow()
+
+ for tz in ["UTC", "Europe/Amsterdam"]:
+ os.environ["TZ"] = tz
+
+ descriptor = RecordDescriptor("""
+test/datetime
+ datetime datetime;
+""")
+
+ record = descriptor.recordType(datetime=now)
+ data = packer.pack(record)
+ r = packer.unpack(data)
+
+ assert r.datetime == now
+
+
+def test_long_int_serialization():
+ packer = RecordPacker()
+
+ long_types = RecordDescriptor("""
+test/long_types
+ varint long_type;
+ varint int_type;
+ varint long_type_neg;
+ varint int_type_neg;
+ varint max_int_as_long;
+ """)
+
+ l = 1239812398217398127398217389217389217398271398217321 # noqa: E741
+ i = 888888
+ lneg = -3239812398217398127398217389217389217398271398217321
+ ineg = -988888
+ max_int_as_long = sys.maxsize
+
+ record = long_types(l, i, lneg, ineg, max_int_as_long)
+ data = packer.pack(record)
+ r = packer.unpack(data)
+
+ assert r.long_type == l
+ assert r.int_type == i
+ assert r.long_type_neg == lneg
+ assert r.int_type_neg == ineg
+ assert r.max_int_as_long == max_int_as_long
+
+
+def test_unicode_serialization():
+ packer = RecordPacker()
+
+ descriptor = RecordDescriptor("""
+test/unicode
+ string text;
+""")
+
+ puny_domains = [b'xn--s7y.co', b'xn--80ak6aa92e.com', b'xn--pple-43d.com']
+
+ for p in puny_domains:
+ domain = codecs.decode(p, "idna")
+ record = descriptor.recordType(text=domain)
+ d = packer.pack(record)
+ record2 = packer.unpack(d)
+
+ assert record.text == record2.text
+ assert record.text == domain
+
+
+def test_pack_long_int_serialization():
+ packer = RecordPacker()
+ # test that 'long int' values which fit in the 'int' type are packed as int internally
+
+ max_neg_int = -0x8000000000000000
+ d = packer.pack([1234, 123456, max_neg_int, sys.maxsize])
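+ # expected msgpack encoding (on a 64-bit platform): \x94 = fixarray(4), \xcd\x04\xd2 = uint16 1234,
+ # \xce\x00\x01\xe2\x40 = uint32 123456, \xd3\x80... = int64 -2**63, \xcf\x7f\xff... = uint64 sys.maxsize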
+ assert d == b'\x94\xcd\x04\xd2\xce\x00\x01\xe2@\xd3\x80\x00\x00\x00\x00\x00\x00\x00\xcf\x7f\xff\xff\xff\xff\xff\xff\xff' # noqa: E501
+
+
+def test_non_existing_field():
+ # RecordDescriptor that is used for local testing in the Broker client
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "text"),
+ ])
+ x = TestRecord(text="Fox-IT, For a More Secure Society")
+
+ # r.content does not exist in the RecordDescriptor
+ assert Selector('lower("Fox-IT") in lower(r.content)').match(x) is False
+ assert Selector('"Fox-IT" in r.content').match(x) is False
+ # because the field does not exist, it will still evaluate to False even for negative matches
+ assert Selector('"Fox-IT" not in r.content').match(x) is False
+ assert Selector('"Fox-IT" in r.content').match(x) is False
+ assert Selector('"Fox-IT" != r.content').match(x) is False
+ assert Selector('"Fox-IT" == r.content').match(x) is False
+ assert Selector('r.content == "Fox-IT, For a More Secure Society"').match(x) is False
+ assert Selector('r.content != "Fox-IT, For a More Secure Society"').match(x) is False
+ assert Selector('r.content in "Fox-IT, For a More Secure Society!"').match(x) is False
+ assert Selector('r.content not in "Fox-IT, For a More Secure Society!"').match(x) is False
+
+ # r.text exists in the RecordDescriptor
+ assert Selector('"fox-it" in lower(r.text)').match(x)
+ assert Selector('r.text in "Fox-IT, For a More Secure Society!!"').match(x)
+ assert Selector('r.text == "Fox-IT, For a More Secure Society"').match(x)
+ assert Selector('r.text != "Fox-IT"').match(x)
+ assert Selector('lower("SECURE") in lower(r.text)').match(x)
+ assert Selector('"f0x-1t" not in lower(r.text)').match(x)
+ assert Selector('lower("NOT SECURE") not in lower(r.text)').match(x)
+
+
+def test_set_field_type():
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "value"),
+ ])
+
+ r = TestRecord(1)
+
+ assert isinstance(r.value, fieldtypes.uint32)
+ r.value = 2
+ assert isinstance(r.value, fieldtypes.uint32)
+
+ with pytest.raises(ValueError):
+ r.value = 'lalala'
+ r.value = 2
+
+ r = TestRecord()
+ assert r.value is None
+ r.value = 1234
+ assert r.value == 1234
+ with pytest.raises(TypeError):
+ r.value = [1, 2, 3, 4, 5]
+
+
+def test_packer_unpacker_none_values():
+ """Tests packing and unpacking of Empty records (default values of None)."""
+ packer = RecordPacker()
+
+ # construct field types from all available fieldtypes
+ field_tuples = []
+ for typename in whitelist.WHITELIST:
+ fieldname = "field_{}".format(typename.replace(".", "_").lower())
+ field_tuples.append((typename, fieldname))
+
+ # create a TestRecord descriptor containing all the fieldtypes
+ TestRecord = RecordDescriptor("test/empty_record", field_tuples)
+
+ # initialize an Empty record and serialize/deserialize
+ record = TestRecord()
+ data = packer.pack(record)
+ r = packer.unpack(data)
+ assert isinstance(r, Record)
+
+
+def test_fieldname_regression():
+ TestRecord = RecordDescriptor("test/uri_typed", [
+ ("string", "fieldname"),
+ ])
+ rec = TestRecord('omg regression')
+
+ assert rec in Selector("r.fieldname == 'omg regression'")
+
+ with pytest.raises(AttributeError):
+ assert rec not in Selector("fieldname == 'omg regression'")
+
+
+def test_version_field_regression():
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "value"),
+ ])
+
+ r = TestRecord(1)
+
+ assert r.__slots__[-1] == '_version'
+
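+ # an out-of-range version value should fall back to the old style record warning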
+ r._version = 256
+ data = packer.pack(r)
+ with pytest.warns(RuntimeWarning) as record:
+ packer.unpack(data)
+
+ assert len(record) == 1
+ assert record[0].message.args[0].startswith("Got old style record with no version information")
+
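+ # a valid but different version should trigger the "other version" warning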
+ r._version = RECORD_VERSION + 1 if RECORD_VERSION < 255 else RECORD_VERSION - 1
+ data = packer.pack(r)
+ with pytest.warns(RuntimeWarning) as record:
+ packer.unpack(data)
+
+ assert len(record) == 1
+ assert record[0].message.args[0].startswith("Got other version record")
+
+
+def test_reserved_field_count_regression():
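+ # Temporarily register an extra reserved field; _version is removed and re-added
+ # so it stays the last reserved field. A record packed with the extra field should
+ # still unpack against a descriptor that does not have it.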
+ del base.RESERVED_FIELDS['_version']
+ base.RESERVED_FIELDS['_extra'] = 'varint'
+ base.RESERVED_FIELDS['_version'] = 'varint'
+
+ TestRecordExtra = RecordDescriptor("test/record", [
+ ("uint32", "value"),
+ ])
+
+ del base.RESERVED_FIELDS['_extra']
+
+ TestRecordBase = RecordDescriptor("test/record", [
+ ("uint32", "value"),
+ ])
+
+ packer = RecordPacker()
+ r = TestRecordExtra(1, _extra=1337)
+
+ assert r.value == 1
+ assert r._extra == 1337
+
+ data = packer.pack(r)
+ packer.register(TestRecordBase)
+
+ unpacked = packer.unpack(data)
+
+ with pytest.raises(AttributeError):
+ unpacked._extra
+
+ assert unpacked.value == 1
+ assert unpacked._version == 1
+
+
+def test_no_version_field_regression():
+ # Emulate an old-style record that was packed without a version field
+ packer = RecordPacker()
+ TestRecord = RecordDescriptor("test/record", [
+ ("uint32", "value"),
+ ])
+ packer.register(TestRecord)
+
+ r = TestRecord(1)
+
+ packed = r._pack()
+ mod = (packed[0], packed[1][:-1]) # Strip version field
+ rdata = packer.pack((RECORD_PACK_TYPE_RECORD, mod))
+ data = packer.pack(msgpack.ExtType(RECORD_PACK_EXT_TYPE, rdata))
+
+ with pytest.warns(RuntimeWarning) as record:
+ unpacked = packer.unpack(data)
+
+ assert len(record) == 1
+ assert record[0].message.args[0].startswith("Got old style record with no version information")
+
+ assert unpacked.value == 1
+ assert unpacked._version == 1 # Version field implicitly added
+
+
+def test_mixed_case_name():
+ assert is_valid_field_name("Test")
+ assert is_valid_field_name("test")
+ assert is_valid_field_name("TEST")
+
+ TestRecord = RecordDescriptor("Test/Record", [
+ ("uint32", "Value"),
+ ])
+
+ r = TestRecord(1)
+ assert r.Value == 1
+
+
+def test_multi_grouped_record_serialization(tmp_path):
+ TestRecord = RecordDescriptor("Test/Record", [
+ ("net.ipv4.Address", "ip"),
+ ])
+ GeoRecord = RecordDescriptor("geoip/country", [
+ ("string", "country"),
+ ("string", "city"),
+ ])
+ ASNRecord = RecordDescriptor("geoip/asn", [
+ ("string", "asn"),
+ ("string", "isp"),
+ ])
+
+ test_rec = TestRecord("1.3.3.7")
+ geo_rec = GeoRecord(country="Netherlands", city="Delft")
+
+ grouped_rec = GroupedRecord("grouped/geoip", [test_rec, geo_rec])
+ asn_rec = ASNRecord(asn="1337", isp="Cyberspace")
+ record = GroupedRecord("grouped/geo/asn", [grouped_rec, asn_rec])
+
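+ # fields of all grouped (sub)records should be accessible on the resulting record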
+ assert record.ip == "1.3.3.7"
+ assert record.country == "Netherlands"
+ assert record.city == "Delft"
+ assert record.asn == "1337"
+ assert record.isp == "Cyberspace"
+
+ writer = RecordWriter(tmp_path / "out.record")
+ writer.write(record)
+ writer.close()
+
+ reader = RecordReader(tmp_path / "out.record")
+ records = list(reader)
+ assert len(records) == 1
+ record = records[0]
+ assert record.ip == "1.3.3.7"
+ assert record.country == "Netherlands"
+ assert record.city == "Delft"
+ assert record.asn == "1337"
+ assert record.isp == "Cyberspace"
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_ast_unicode_literals(PSelector):
+ TestRecord = RecordDescriptor("Test/Record", [])
+ assert TestRecord() in PSelector("get_type('string literal') == get_type(u'hello')")
+ assert TestRecord() in PSelector("get_type('not bytes') != get_type(b'hello')")
+
+
+def test_grouped_replace():
+ TestRecord = RecordDescriptor("test/adapter", [
+ ("uint32", "number"),
+ ])
+ OtherRecord = RecordDescriptor("test/other", [
+ ("string", "other"),
+ ])
+
+ # Constructing grouped record normally
+ record = TestRecord(number=1)
+ other_record = OtherRecord("foobar")
+ grouped_record = GroupedRecord("grouped/original", [record, other_record])
+ assert grouped_record._source is None
+ assert grouped_record.number == 1
+ assert grouped_record.other == "foobar"
+
+ # Constructing grouped record normally (using a replaced record)
+ replaced_record = record._replace(_source="newsource")
+ grouped_record = GroupedRecord("grouped/replaced", [replaced_record, other_record])
+ assert grouped_record._source == "newsource"
+ assert grouped_record.number == 1
+ assert grouped_record.other == "foobar"
+
+ # Test GroupedRecord replace
+ replaced_grouped_record = grouped_record._replace(number=100)
+ assert replaced_grouped_record.number == 100
+ assert replaced_grouped_record.other == "foobar"
+
+ # Test with multiple replacements
+ replaced_grouped_record = grouped_record._replace(number=200, other="a string", _source="testcase")
+ assert replaced_grouped_record.number == 200
+ assert replaced_grouped_record.other == "a string"
+ assert replaced_grouped_record._source == "testcase"
+
+ # Replacing a non-existing field should raise a ValueError
+ with pytest.raises(ValueError) as excinfo:
+ grouped_record._replace(number=100, other="changed", non_existing_field="oops")
+ excinfo.match(".*Got unexpected field names:.*non_existing_field.*")
+
+
+if __name__ == "__main__":
+ __import__("standalone_test").main(globals())
diff --git a/tests/test_selector.py b/tests/test_selector.py
new file mode 100644
index 0000000..8a9fda1
--- /dev/null
+++ b/tests/test_selector.py
@@ -0,0 +1,504 @@
+from datetime import datetime
+
+import pytest
+
+from flow.record import RecordDescriptor
+from flow.record.selector import CompiledSelector, InvalidOperation, Selector
+
+
+def test_selector_func_name():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+ assert TestRecord(None, None) not in Selector("name(r) == 'foo/bar'")
+ assert TestRecord(None, None) in Selector("name(r) == 'test/record'")
+
+
+def test_selector():
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+ TestRecord2 = RecordDescriptor("test/record2", [
+ ("string", "key"),
+ ("string", "content"),
+ ])
+
+ assert TestRecord("foo", "bar") in Selector("r.query == 'foo'")
+ assert TestRecord(None, None) not in Selector("r.query == 'foo'")
+ assert TestRecord(None, None) not in Selector("name(r.query) == 'XX'")
+
+ with pytest.raises(InvalidOperation):
+ assert TestRecord(None, None) not in Selector("r.__class__ == 'str'")
+
+ s = Selector("lower(upper(r.content)) == 'xx'")
+ assert TestRecord("XX", "XX") not in s
+ assert TestRecord2("XX", "XX") in s
+
+ assert TestRecord(None, "BAR") in Selector(
+ "lower(r.query) == 'test' or lower(r.adsadsa) == 't' or lower(r.url) == 'bar'")
+
+ with pytest.raises(InvalidOperation):
+ assert TestRecord() in Selector("invalid_func(r.invalid_field, 1337) or r.id == 4")
+
+
+def test_selector_meta_query_true():
+ source = "internal/flow.record.test"
+
+ desc = RecordDescriptor("test/record", [
+ ("string", "value"),
+ ])
+ rec = desc("value", _source=source)
+ assert rec in Selector("r._source == '{}'".format(source))
+
+
+def test_selector_meta_query_false():
+ source = "internal/flow.record.test"
+
+ desc = RecordDescriptor("test/record", [
+ ("string", "value"),
+ ])
+ rec = desc("value", _source=source + "nope")
+ assert (rec in Selector("r._source == '{}'".format(source))) is False
+
+
+def test_selector_basic_query_true():
+ md5hash = "My MD5 hash!"
+
+ desc = RecordDescriptor("test/md5_hash", [
+ ("string", "md5"),
+ ])
+ rec = desc(md5hash)
+ assert rec in Selector("r.md5 == '{}'".format(md5hash))
+
+
+def test_selector_basic_query_false():
+ md5hash = "My MD5 hash!"
+
+ desc = RecordDescriptor("test/md5_hash", [
+ ("string", "md5"),
+ ])
+ rec = desc(md5hash + "nope")
+ assert (rec in Selector("r.md5 == '{}'".format(md5hash))) is False
+
+
+def test_selector_non_existing_field():
+ md5hash = "My MD5 hash!"
+
+ desc = RecordDescriptor("test/md5_hash", [
+ ("string", "md5"),
+ ])
+ rec = desc(md5hash)
+ assert (rec in Selector("r.non_existing_field == 1337")) is False
+
+
+# [MS] Disabled, list types?
+# def test_selector_string_in_array():
+# obj = Expando()
+# obj.filenames = ['record_mitchel_keystrokes.exe', 'python.exe', 'chrome.exe']
+
+# s = Selector("'{}' in r.filenames".format(obj.filenames[0]))
+# assert (obj in s) is True
+
+
+def test_selector_string_contains():
+ desc = RecordDescriptor("test/filetype", [
+ ("string", "filetype"),
+ ])
+ rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ assert rec in Selector("'PE' in r.filetype")
+
+
+def test_selector_not_in_operator():
+ desc = RecordDescriptor("test/md5_hash", [
+ ("string", "filetype"),
+ ])
+ rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ assert rec in Selector("'ELF' not in r.filetype")
+
+
+def test_selector_or_operator():
+ desc = RecordDescriptor("test/filetype", [
+ ("string", "filetype"),
+ ])
+ rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ assert rec in Selector("'PE32' in r.filetype or 'PE64' in r.xxxx")
+
+
+def test_selector_and_operator():
+ desc = RecordDescriptor("test/filetype", [
+ ("string", "filetype"),
+ ("string", "xxxx"),
+ ])
+
+ rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows', 'PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ assert rec in Selector("'PE32' in r.filetype and 'PE32' in r.xxxx")
+
+
+def test_selector_in_function():
+ desc = RecordDescriptor("test/filetype", [
+ ("string", "filetype"),
+ ])
+ rec = desc('PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ assert rec in Selector("'pe' in lower(r.filetype)")
+
+
+def test_selector_function_call_whitelisting():
+ TestRecord = RecordDescriptor("test/filetype", [
+ ("string", "filetype"),
+ ])
+ rec = TestRecord('PE32 executable (GUI) Intel 80386, for MS Windows')
+
+ # We allow explicitly exposed functions
+ assert rec in Selector("'pe32' in lower(r.filetype)")
+ # But calling methods on the field values themselves is not allowed
+ with pytest.raises(Exception) as excinfo:
+ rec in Selector("'pe' in r.filetype.lower()")
+
+ assert rec in Selector("'EXECUTABLE' in upper(r.filetype)")
+ with pytest.raises(Exception) as excinfo:
+ rec in Selector("'EXECUTABLE' in r.filetype.upper()")
+
+ IPRecord = RecordDescriptor("test/address", [
+ ("net.ipv4.Address", "ip"),
+ ])
+ rec = IPRecord("192.168.1.1")
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.1.0/24')")
+ assert rec not in Selector("r.non_existing_field in net.ipv4.Subnet('192.168.1.0/24')")
+
+ # We call net.ipv4 instead of net.ipv4.Subnet, which should fail
+ with pytest.raises(Exception) as excinfo:
+ assert rec in Selector("r.ip in net.ipv4('192.168.1.0/24')")
+ excinfo.match("Call 'net.ipv4' not allowed. No calls other then whitelisted 'global' calls allowed!")
+
+
+def test_selector_subnet():
+ desc = RecordDescriptor("test/ip", [
+ ("net.ipv4.Address", "ip"),
+ ])
+ rec = desc('192.168.10.1')
+
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.1/32')")
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.0/24')")
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.0.0/16')")
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.0.0.0/8')")
+ assert rec in Selector("r.ip in net.ipv4.Subnet('192.168.10.1')")
+ assert rec in Selector("r.ip not in net.ipv4.Subnet('10.0.0.0/8')")
+
+
+def test_field_equals():
+ desc = RecordDescriptor("test/record", [
+ ("string", "mailfrom"),
+ ("string", "mailto"),
+ ("string", "foo"),
+ ])
+ rec = desc("hello@world.com", "foo@bar.com", "testing")
+ assert rec in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hello@world.com',])")
+ assert rec in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hElLo@WoRlD.com',])")
+ assert rec not in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hElLo@WoRlD.com',], nocase=False)")
+ assert rec not in CompiledSelector("field_equals(r, ['mailfrom', 'mailto'], ['hello',])")
+
+
+def test_field_contains():
+ desc = RecordDescriptor("test/record", [
+ ("string", "mailfrom"),
+ ("string", "mailto"),
+ ("string", "foo"),
+ ])
+ rec = desc("hello@world.com", "foo@bar.com", "testing")
+ rec2 = desc("hello@world.com", "foo@bar.com")
+
+ assert rec in CompiledSelector("field_contains(r, ['mailfrom', 'mailto'], ['foo@bar.com', 'test@fox-it.com'])")
+ assert rec in CompiledSelector("field_contains(r, ['mailfrom', 'mailto'], ['FOO', 'HELLO'])")
+ assert rec in Selector("field_contains(r, ['mailfrom', 'mailto'], ['FOO', 'HELLO'])")
+ assert rec2 not in CompiledSelector("field_contains(r, ['testing'], ['TEST@fox-it.com'])")
+
+
+def test_field_contains_word_boundary():
+ desc = RecordDescriptor("test/record", [
+ ("string", "mailfrom"),
+ ("string", "mailto"),
+ ("string", "foo"),
+ ("string", "content"),
+ ])
+ rec = desc("hello@world.com", "foo@bar.com", "testing", "This is a testing string")
+ rec2 = desc("helloworld@world.com", "foo@bar.com")
+ rec3 = desc(None, None)
+ rec4 = desc(None, None, "hello@world.com")
+ rec5 = desc()
+ assert rec in Selector(
+ "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)")
+ assert rec not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto'], ['hello.'], word_boundary=True)") # Check regex escaping...
+ assert rec not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto'], ['HELLO'], nocase=False, word_boundary=True)")
+ assert rec2 not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)")
+ assert rec2 not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto', 'nonexistingfield'], ['hello'], word_boundary=True)")
+ assert rec3 not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto'], ['hello'], word_boundary=True)")
+ assert rec4 in Selector(
+ "field_contains(r, ['mailfrom', 'mailto', 'foo'], ['hello'], word_boundary=True)")
+ assert rec5 not in Selector(
+ "field_contains(r, ['mailfrom', 'mailto', 'foo'], ['hello'], word_boundary=True)")
+
+ assert rec not in Selector("field_contains(r, ['content'], ['sting'], word_boundary=True)")
+ assert rec in Selector("field_contains(r, ['content'], ['testing'], word_boundary=True)")
+
+
+def test_field_regex():
+ desc = RecordDescriptor("test/record", [
+ ("string", "mailfrom"),
+ ("string", "mailto"),
+ ("string", "foo"),
+ ])
+ rec = desc("hello@world.com", "foo@bar.com", "testing")
+
+ assert rec in Selector(r"field_regex(r, ['mailfrom', 'mailto'], r'.+@.+\.com')")
+ assert rec in CompiledSelector(r"field_regex(r, ['mailfrom', 'mailto'], r'.+@.+\.com')")
+ assert rec not in Selector("field_regex(r, ['mailfrom', 'mailto'], r'.+@fox-it.com')")
+ assert rec not in CompiledSelector("field_regex(r, ['mailfrom', 'mailto'], r'.+@fox-it.com')")
+
+
+def test_selector_uri():
+ TestRecord = RecordDescriptor("test/uri", [
+ ("uri", "uri"),
+ ])
+ rec = TestRecord('http://www.google.com/evil.bin')
+ assert rec in Selector("r.uri.filename in ['evil.bin', 'foo.bar']")
+
+
+def test_selector_typed():
+ TestRecord = RecordDescriptor("test/uri_typed", [
+ ("uri", "urifield1"),
+ ("uri", "urifield2"),
+ ("string", "stringfield"),
+ ])
+ rec = TestRecord('helloworld.exe', 'another.bin', 'Fox-IT')
+ assert rec in Selector("Type.uri.filename == 'helloworld.exe'")
+ assert rec in CompiledSelector("Type.uri.filename == 'helloworld.exe'")
+ assert rec in Selector("Type.uri.filename != 'howdyworld.exe'")
+ assert rec in CompiledSelector("Type.uri.filename != 'howdyworld.exe'")
+ assert rec in Selector("'another' in Type.uri.filename")
+ assert rec in CompiledSelector("'another' in Type.uri.filename")
+ assert rec in Selector("field_contains(r, Type.uri.filename, ['hello'])")
+ assert rec in CompiledSelector("field_contains(r, Type.uri.filename, ['hello'])")
+ assert rec in Selector("field_equals(r, Type.uri.filename, ['another.bin'])")
+ assert rec in CompiledSelector("field_equals(r, Type.uri.filename, ['another.bin'])")
+ assert rec in Selector(r"field_regex(r, Type.uri.filename, r'hello\w{5}.exe')")
+ assert rec in CompiledSelector(r"field_regex(r, Type.uri.filename, r'hello\w{5}.exe')")
+
+ # Test TypeMatcher reuse
+ assert rec in Selector("Type.uri.filename == 'helloworld.exe' or Type.uri.filename == 'another.bin'")
+ assert rec in CompiledSelector("Type.uri.filename == 'helloworld.exe' or Type.uri.filename == 'another.bin'")
+
+ assert rec in Selector("Type.string == 'Fox-IT'")
+ assert rec in CompiledSelector("Type.string == 'Fox-IT'")
+ assert rec in Selector("field_equals(r, Type.string, ['Fox-IT'])")
+ assert rec in CompiledSelector("field_equals(r, Type.string, ['Fox-IT'])")
+ assert rec in Selector("field_contains(r, Type.string, ['Fox'])")
+ assert rec in CompiledSelector("field_contains(r, Type.string, ['Fox'])")
+ assert rec in Selector(r"field_regex(r, Type.string, r'Fox-\w{2}')")
+ assert rec in CompiledSelector(r"field_regex(r, Type.string, r'Fox-\w{2}')")
+
+ assert rec not in Selector("Type.filename == 'lalala'")
+ assert rec not in CompiledSelector("Type.filename == 'lalala'")
+ assert rec not in Selector("Type.uri.filename == 'lalala'")
+ assert rec not in CompiledSelector("Type.uri.filename == 'lalala'")
+ assert rec not in Selector("field_contains(r, Type.uri.filename, ['nope'])")
+ assert rec not in CompiledSelector("field_contains(r, Type.uri.filename, ['nope'])")
+ assert rec not in Selector("field_equals(r, Type.uri.filename, ['nope'])")
+ assert rec not in CompiledSelector("field_equals(r, Type.uri.filename, ['nope'])")
+ assert rec not in Selector("field_regex(r, Type.uri.filename, 'nope')")
+ assert rec not in CompiledSelector("field_regex(r, Type.uri.filename, 'nope')")
+
+ TestNamespaceRecord = RecordDescriptor("test/ip", [
+ ("net.ipv4.Address", "ip"),
+ ])
+ rec = TestNamespaceRecord('192.168.10.1')
+
+ # This will only work in "normal" selectors, because we need to override the behaviour
+ # of the __contains__ operator to unwrap the requested values
+ assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.1/32')")
+ assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.0/24')")
+ assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.0.0/16')")
+ assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.0.0.0/8')")
+ assert rec in Selector("Type.net.ipv4.Address in net.ipv4.Subnet('192.168.10.1')")
+ assert rec in Selector("Type.net.ipv4.Address not in net.ipv4.Subnet('10.0.0.0/8')")
+
+ with pytest.raises(InvalidOperation):
+ assert rec in Selector("Type.uri.filename.__class__ == 'invalid'")
+
+
+def test_selector_unicode():
+ TestRecord = RecordDescriptor("test/string", [
+ ("string", "name"),
+ ])
+ rec = TestRecord("Jack O'Neill")
+ assert rec not in Selector("field_contains(r, ['name'], [u'Jack O\u2019Neill'])")
+
+ rec = TestRecord(u"jack o\u2019neill")
+ assert rec in Selector("field_contains(r, ['name'], [u'Jack O\u2019Neill'])")
+
+
+def test_record_in_records():
+ RecordA = RecordDescriptor("test/record_a", [
+ ("datetime", "some_dt"),
+ ("string", "field"),
+ ])
+ RecordB = RecordDescriptor("test/record_b", [
+ ("record", "record"),
+ ("datetime", "some_dt"),
+ ])
+ RecordC = RecordDescriptor("test/record_c", [
+ ("record[]", "records"),
+ ])
+ RecordD = RecordDescriptor("test/record_d", [
+ ("string[]", "stringlist"),
+ ])
+
+ test_str = "this is a test"
+ dt = datetime.utcnow()
+ record_a = RecordA(
+ some_dt=dt,
+ field=test_str)
+ record_b = RecordB(
+ record=record_a,
+ some_dt=dt)
+
+ subrecords = []
+ record_d = None
+ for i in range(10):
+ record_d = RecordD(
+ stringlist=["aap", "noot", "mies", "Subrecord {}".format(i)])
+ subrecords.append(record_d)
+
+ subrecords.append(record_a)
+ record_c = RecordC(
+ records=subrecords)
+
+ subrecords.append(None)
+ record_c_with_none_values = RecordC(
+ records=subrecords)
+
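+ # selectors should also match fields inside nested records and record lists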
+ assert record_b in Selector("r.record.field == '{}'".format(test_str))
+ assert record_b in Selector("Type.string == '{}'".format(test_str))
+ assert record_c in Selector("Type.string == '{}'".format(test_str))
+ assert record_d in Selector("any(s == 'Subrecord 9' for s in r.stringlist)")
+ assert record_c in Selector("any(s == 'Subrecord 9' for e in r.records for s in e.stringlist)")
+ assert record_c_with_none_values in Selector(
+ "any(s == 'Subrecord 9' for e in r.records for s in e.stringlist)")
+ assert record_d not in Selector("any(s == 'Subrecord 9' for s in r.nonexistingfield)")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_non_existing_field(PSelector):
+ TestRecord = RecordDescriptor("test/record", [
+ ("string", "query"),
+ ("string", "url"),
+ ])
+
+ assert TestRecord("foo", "bar") not in PSelector("r.query and r.non_existing_field")
+ assert TestRecord("foo", "bar") in PSelector("not r.non_existing_field")
+ assert TestRecord("foo", "bar") in PSelector("r.query and r.url and not r.non_existing_field")
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_modulo(PSelector):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "counter"),
+ ])
+
+ records = []
+ for i in range(300):
+ records.append(TestRecord(i))
+
+ selected = [rec for rec in records if rec in PSelector("r.counter % 10 == 0")]
+ assert len(selected) == 30
+
+ for rec in records:
+ sel = PSelector("r.counter % 10 == 0")
+ if rec.counter % 10 == 0:
+ assert rec in sel
+ else:
+ assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_bit_and(PSelector):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "counter"),
+ ])
+
+ records = []
+ for i in range(300):
+ records.append(TestRecord(i))
+
+ for rec in records:
+ sel = PSelector("(r.counter & 0x0F) == 1")
+ if rec.counter & 0x0F == 1:
+ assert rec in sel
+ else:
+ assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_bit_or(PSelector):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "counter"),
+ ])
+
+ records = []
+ for i in range(300):
+ records.append(TestRecord(i))
+
+ for rec in records:
+ sel = PSelector("(r.counter | 0x10) == 0x11")
+ if rec.counter | 0x10 == 0x11:
+ assert rec in sel
+ else:
+ assert rec not in sel
+
+
+@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
+def test_selector_modulo_non_existing_field(PSelector):
+ TestRecord = RecordDescriptor("test/record", [
+ ("varint", "counter"),
+ ])
+
+ records = []
+ for i in range(300):
+ records.append(TestRecord(i))
+
+ sel = PSelector("r.counter % 10 == 0")
+ for rec in records:
+ if rec.counter % 10 == 0:
+ assert rec in sel
+ else:
+ assert rec not in sel
+
+ # Test with non-existing fields
+ # using has_field() ensures that this works with CompiledSelector and Selector
+ sel = PSelector("has_field(r, 'counterz') and r.counterz % 10 == 0")
+ for rec in records:
+ if hasattr(rec, "counterz") and rec.counterz % 10 == 0:
+ assert rec in sel
+ else:
+ assert rec not in sel
+
+ # non-existing field but without the precheck (this only works with the plain Selector)
+ if PSelector is Selector:
+ sel = PSelector("r.counterz % 10 == 0")
+ for rec in records:
+ assert rec not in sel
+
+
+if __name__ == "__main__":
+ __import__("standalone_test").main(globals())
diff --git a/tests/test_splunk_adapter.py b/tests/test_splunk_adapter.py
new file mode 100644
index 0000000..38c910b
--- /dev/null
+++ b/tests/test_splunk_adapter.py
@@ -0,0 +1,112 @@
+from unittest import mock
+
+from flow.record import RecordDescriptor
+import flow.record.adapter.splunk
+from flow.record.adapter.splunk import splunkify
+
+
+def test_splunkify_reserved_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set(["foo"])
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ [("string", "foo")]
+ )
+
+ test_record = test_record_descriptor(foo="bar")
+
+ output = splunkify(test_record)
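+ # reserved Splunk field names get an "rd_" prefix in the output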
+ assert output == 'type="test/record" rdtag=None rd_foo="bar"'
+
+
+def test_splunkify_normal_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set()
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ [("string", "foo")]
+ )
+
+ test_record = test_record_descriptor(foo="bar")
+
+ output = splunkify(test_record)
+ assert output == 'type="test/record" rdtag=None foo="bar"'
+
+
+def test_splunkify_rdtag_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set()
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ )
+
+ test_record = test_record_descriptor()
+
+ output = splunkify(test_record, tag="bar")
+ assert output == 'type="test/record" rdtag="bar"'
+
+
+def test_splunkify_none_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set()
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ [("string", "foo")]
+ )
+
+ test_record = test_record_descriptor()
+
+ output = splunkify(test_record)
+ assert output == 'type="test/record" rdtag=None foo=None'
+
+
+def test_splunkify_byte_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set()
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ [("bytes", "foo")]
+ )
+
+ test_record = test_record_descriptor(foo=b"bar")
+
+ output = splunkify(test_record)
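+ # bytes values are expected to be base64-encoded in the output (b"bar" -> "YmFy")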
+ assert output == 'type="test/record" rdtag=None foo="YmFy"'
+
+
+def test_splunkify_backslash_quote_field():
+
+ with mock.patch.object(
+ flow.record.adapter.splunk,
+ "RESERVED_SPLUNK_FIELDS",
+ set()
+ ):
+ test_record_descriptor = RecordDescriptor(
+ "test/record",
+ [("string", "foo")]
+ )
+
+ test_record = test_record_descriptor(foo=b"\\\"")
+
+ output = splunkify(test_record)
+ assert output == 'type="test/record" rdtag=None foo="\\\\\\""'
diff --git a/tests/utils_inspect.py b/tests/utils_inspect.py
new file mode 100644
index 0000000..4427491
--- /dev/null
+++ b/tests/utils_inspect.py
@@ -0,0 +1,58 @@
+"""
+Backport of `inspect.signature` for Python 2.
+
+Based on: https://github.com/python/cpython/blob/3.7/Lib/inspect.py
+"""
+
+import inspect
+import collections
+
+
+class _empty:
+ pass
+
+
+class Parameter:
+ POSITIONAL_ONLY = 0
+ POSITIONAL_OR_KEYWORD = 1
+ VAR_POSITIONAL = 2
+ KEYWORD_ONLY = 3
+ VAR_KEYWORD = 4
+
+ empty = _empty
+
+ def __init__(self, name, kind, default=_empty):
+ self.name = name
+ self.kind = kind
+ self.default = default
+
+
+class Signature:
+ empty = _empty
+
+ def __init__(self, parameters=None):
+ self.parameters = parameters
+
+
+def signature(obj):
+ try:
+ # Python 3
+ return inspect.signature(obj)
+ except AttributeError:
+ # Python 2
+ spec = inspect.getargspec(obj)
+
+ # Create parameter objects which are compatible with python 3 objects
+ parameters = collections.OrderedDict()
+ for i in range(0, len(spec.args)):
+ arg = spec.args[i]
+ default = _empty
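+ # spec.defaults align with the trailing arguments, so index from the end of spec.args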
+ if spec.defaults and (len(spec.args) - i <= len(spec.defaults)):
+ default = spec.defaults[i - len(spec.args)]
+ parameters[arg] = Parameter(name=arg, default=default, kind=Parameter.POSITIONAL_OR_KEYWORD)
+ if spec.varargs:
+ parameters[spec.varargs] = Parameter(name=spec.varargs, kind=Parameter.VAR_POSITIONAL)
+ if spec.keywords:
+ parameters[spec.keywords] = Parameter(name=spec.keywords, kind=Parameter.VAR_KEYWORD)
+
+ return Signature(parameters=parameters)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..7293d76
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,58 @@
+[tox]
+envlist = lint, py3, pypy3
+# This version of tox will autoprovision itself and the requirements defined in
+# requires if they are not available on the host system.
+minversion = 3.8.0
+# This version of virtualenv installs a pip version of at least 19.0.1 in its
+# venvs.
+# Requiring minimally this version of virtualenv to be available prevents the
+# need of having to explicitly specify a pip>=19.0 dependency in every testenv.
+# pip>=19.0 is needed to ensure the sdist built by tox (which is built
+# according to PEP 517 and PEP 518 by tox versions >= 3.4.0) is also installed
+# properly (according to PEP 517 and PEP 518 by pip>=19.0) in the virtualenvs.
+# If the dependency is not available on the host system, and the installed tox
+# version is >= 3.3.0, tox will self bootstrap an environment with the proper
+# versions (including the version of tox itself).
+requires = virtualenv>=16.3.0
+isolated_build = true
+# Putting the dist dir in the project directory instead of in the {toxworkdir},
+# makes the sdist more easily accessible and removes the need to rebuild it
+# for the [testenv:build] target.
+distdir = {toxinidir}/dist
+
+[testenv]
+deps =
+ pytest
+ pytest-cov
+ coverage
+commands =
+# Capturing output will fail on pypy, possibly due to this issue: https://github.com/pytest-dev/pytest/issues/5502
+ pytest --basetemp="{envtmpdir}" {posargs:--color=yes --capture=no --cov=flow --cov-report=term-missing -v tests}
+ coverage report
+ coverage xml
+
+[testenv:lint]
+# Force the Python version here, so linting will be done with the correct
+# Python version. There should be no difference between the CPython and pypy
+# implementations, so we pick one.
+basepython = python3
+deps =
+ flake8
+commands =
+ flake8 flow tests setup.py
+
+[testenv:build]
+# Force the Python version here, so building will be done with the correct
+# Python version. As the distributions are pure Python, there should be no
+# difference between the CPython and pypy implementations, so we pick one.
+basepython = python3
+deps =
+commands =
+ pip wheel --no-deps -w ./dist .
+
+[flake8]
+max-line-length = 120
+extend-ignore =
+ # See https://github.com/PyCQA/pycodestyle/issues/373
+ E203,
+statistics = True