Skip to end of metadata
Go to start of metadata

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

« Previous Version 3 Current »

This pipelet lets you configure, in the Squirro UI, a list of datetime format strings to try when parsing the values of a source field.

You may upload it to your Squirro instance, and then you can add it in the Load section of your pipeline workflows, before the Transform Input step (if it is present).

Please read the pipelet's docstring for an example of how to configure it.

from squirro.sdk import PipeletV1
from datetime import datetime


# strptime/strftime layout used twice by the pipelet below: as the last format
# tried when parsing a value, and as the format every parsed value is written
# back in.
SQUIRRO_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"


class TimeFormatsPipelet(PipeletV1):
    """Parse values of datetime fields using a list of time formats.

    This pipelet can be used when you want to create a label (facet) for
    a datetime source field whose values may come in more than one format.

    For example, there is a source field called `date_info` which stores
    datetime information and takes the following three values:
    13/09/2022, 13:05:18, 2022-09-13T13:05:18
    All three values follow a different format, and some are even missing
    part of the information (the date or the time part).

    You can use this pipelet as the first step of your pipeline workflow
    (before the `Transform Input` step if present), and configure its
    "Source Field to Time Formats map" argument as follows:
    ```
    {
      "date_info": [
        "%d/%m/%Y",
        "%H:%M:%S"
      ]
    }
    ```
    The third value needs no entry of its own: `SQUIRRO_DATE_FORMAT` is
    always tried after the configured formats. Every value that parses is
    rewritten in `SQUIRRO_DATE_FORMAT`; parts a format does not carry take
    the `datetime.strptime` defaults (1900-01-01 for a missing date,
    00:00:00 for a missing time).
    """

    @staticmethod
    def getArguments():
        """Describe the single configuration argument of this pipelet."""
        return [
            {
                "name": "source_field_time_formats_map",
                "display_label": "Source Field to Time Formats map",
                "help": (
                    "Dictionary which maps source field names to time formats "
                    "to try out in order to successfully parse their values."
                ),
                "type": "code",
                "syntax": "json",
            },
        ]

    def __init__(self, config):
        """Store the pipelet configuration.

        Args:
            config: Mapping of argument names to their configured values.
                A missing configuration (`None`) is treated as empty.
        """
        self.config = config or {}

    def _source_field_formats(self):
        """Return the configured field-name -> time-formats mapping as a dict.

        Raises:
            ValueError: If the configured value is not (or does not decode
                to) a JSON object.
        """
        formats_map = self.config.get("source_field_time_formats_map") or {}

        if isinstance(formats_map, str):
            # NOTE(review): the argument is declared as JSON "code"; whether
            # it arrives already decoded or as raw text is not visible from
            # this file, so both forms are accepted - confirm against the SDK.
            import json

            formats_map = json.loads(formats_map) if formats_map.strip() else {}

        if not isinstance(formats_map, dict):
            raise ValueError(
                "source_field_time_formats_map must be a JSON object mapping "
                f"field names to lists of time formats, got: {formats_map!r}"
            )
        return formats_map

    def consume(self, item):
        """Normalize the configured datetime fields of `item` in place.

        For each configured field present on the item, the configured
        formats are tried in order, followed by `SQUIRRO_DATE_FORMAT`; the
        first one that parses wins and the value is rewritten in
        `SQUIRRO_DATE_FORMAT`. Values that are not strings (for example
        `None`) are left untouched, because `datetime.strptime` only accepts
        strings.

        Args:
            item: The item dictionary being processed.

        Returns:
            The same `item`, with the configured fields rewritten.

        Raises:
            ValueError: If a string value matches none of the formats.
        """
        # Walk the (small) configuration rather than every field of the item.
        for field, time_formats in self._source_field_formats().items():
            if field not in item:
                continue

            value = item[field]
            if not isinstance(value, str):
                continue

            # Tolerate a single format given as a bare string, or null.
            if isinstance(time_formats, str):
                time_formats = [time_formats]
            candidates = [*(time_formats or []), SQUIRRO_DATE_FORMAT]

            parsed = None
            for time_format in candidates:
                try:
                    parsed = datetime.strptime(value, time_format)
                except ValueError:
                    continue
                break

            if parsed is None:
                raise ValueError(
                    f"Could not parse value {value} with any of the "
                    f"configured formats: {time_formats}"
                )
            item[field] = parsed.strftime(SQUIRRO_DATE_FORMAT)

        return item

  • No labels