Merging next to main for release 2.5.3 (#25)

* [PAPP-33470] Bugfix: Recurrent multipart parsing (#20) * recurrent multipart parsing * Update README.md * linting fix * fix for "in_attachment" traversal * ph_status -> ret_val * applying suggestions --------- Co-authored-by: splunk-soar-connectors-admin <admin@splunksoar> * Bumped up the version of googleworkspaceforgmail from 2.5.2 to 2.5.3 * proper release notes, function naming convention (#24) * Release notes for version 2.5.3 --------- Co-authored-by: mposluszny-splunk <150343546+mposluszny-splunk@users.noreply.github.com> Co-authored-by: splunk-soar-connectors-admin <admin@splunksoar> Co-authored-by: root <root@splunksoar>
splunk-soar-connectors · Apr 8, 2024 · ee34a49 · ee34a49
1 parent 6134570
commit ee34a49
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 74 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 # G Suite for GMail
 
 Publisher: Splunk  
-Connector Version: 2.5.2  
+Connector Version: 2.5.3  
 Product Vendor: Google  
 Product Name: GMail  
 Product Version Supported (regex): ".\*"  
@@ -295,6 +295,7 @@ PARAMETER | REQUIRED | DESCRIPTION | TYPE | CONTAINS
 **email** |  required  | User's Email (Mailbox to search) | string |  `email` 
 **internet_message_id** |  required  | Internet Message ID | string |  `internet message id` 
 **extract_attachments** |  optional  | Add attachments to vault and create vault artifacts | boolean | 
+**extract_nested** |  optional  | Works when `extract_attachments` is set to `true`. Extracts attachments from nested email attachments. | boolean | 
 
 #### Action Output
 DATA PATH | TYPE | CONTAINS | EXAMPLE VALUES

diff --git a/gsgmail.json b/gsgmail.json
@@ -16,10 +16,10 @@
     "latest_tested_versions": [
         "Cloud, May 26, 2023"
     ],
-    "app_version": "2.5.2",
+    "app_version": "2.5.3",
     "product_version_regex": ".*",
     "license": "Copyright (c) 2017-2024 Splunk Inc.",
-    "utctime_updated": "2024-03-18T08:57:36.000000Z",
+    "utctime_updated": "2024-04-05T12:12:46.000000Z",
     "configuration": {
         "login_email": {
             "required": true,
@@ -866,6 +866,12 @@
                     "data_type": "boolean",
                     "default": false,
                     "order": 2
+                },
+                "extract_nested": {
+                    "description": "Works when `extract_attachments` is set to `true`. Extracts attachments from nested email attachments.",
+                    "data_type": "boolean",
+                    "default": false,
+                    "order": 3
                 }
             },
             "output": [
@@ -1281,4 +1287,4 @@
             }
         ]
     }
-}
+}
diff --git a/gsgmail_connector.py b/gsgmail_connector.py
@@ -252,75 +252,6 @@ def _get_email_headers_from_part(self, part):
 
         return dict(headers)
 
-    def _parse_multipart_msg(self, action_result, msg, email_details, extract_attachments=False):
-        plain_bodies = []
-        html_bodies = []
-        container_id = self.get_container_id()
-
-        email_details['email_headers'] = []
-        for part in msg.walk():
-            part_type = part.get_content_type()
-            headers = self._get_email_headers_from_part(part)
-            # split out important headers (for output table rendering)
-            if headers.get('to'):
-                email_details['to'] = headers.get('to')
-
-            if headers.get('from'):
-                email_details['from'] = headers.get('from')
-
-            if headers.get('subject'):
-                email_details['subject'] = headers.get('subject')
-
-            disp = str(part.get('Content-Disposition'))
-            file_name = part.get_filename()
-            # look for plain text parts, but skip attachments
-            if part_type == 'text/plain' and 'attachment' not in disp:
-                charset = part.get_content_charset() or 'utf8'
-                # decode the base64 unicode bytestring into plain text
-                plain_body = part.get_payload(decode=True).decode(encoding=charset, errors="ignore")
-                # Add to list of plan text bodies
-                plain_bodies.append(plain_body)
-            if part_type == 'text/html' and 'attachment' not in disp:
-                charset = part.get_content_charset() or 'utf8'
-                # decode the base64 unicode bytestring into plain text
-                html_body = part.get_payload(decode=True).decode(encoding=charset, errors="ignore")
-                # Add to list of html bodies
-                html_bodies.append(html_body)
-            elif file_name and extract_attachments:
-                attach_resp = None
-                try:
-                    if part_type.startswith("message/"):
-                        content = part.get_payload(0).as_string()
-                    else:
-                        content = part.get_payload(decode=True)
-                    # Create vault item with attachment payload
-                    attach_resp = Vault.create_attachment(content, container_id=container_id, file_name=file_name)
-                except Exception as e:
-                    message = self._get_error_message_from_exception(e)
-                    return action_result.set_status(phantom.APP_ERROR, f"Unable to add attachment: {file_name} Error: {message}")
-                if attach_resp.get('succeeded'):
-                    # Create vault artifact
-                    artifact = {
-                        'name': 'Email Attachment Artifact',
-                        'container_id': container_id,
-                        'cef': {
-                            'vaultId': attach_resp[phantom.APP_JSON_HASH],
-                            'fileHash': attach_resp[phantom.APP_JSON_HASH],
-                            'file_hash': attach_resp[phantom.APP_JSON_HASH],
-                            'fileName': file_name
-                        },
-                        'run_automation': False,
-                        'source_data_identifier': None
-                    }
-                    ret_val, msg, _ = self.save_artifact(artifact)
-                    if phantom.is_fail(ret_val):
-                        return action_result.set_status(phantom.APP_ERROR, "Could not save artifact to container: {}".format(msg))
-            email_details['email_headers'].append(headers)
-        email_details['parsed_plain_body'] = '\n\n'.join(plain_bodies)
-        email_details['parsed_html_body'] = '\n\n'.join(html_bodies)
-
-        return phantom.APP_SUCCESS
-
     def _handle_run_query(self, param):
 
         # Implement the handler here, some basic code is already in
@@ -407,6 +338,148 @@ def _handle_run_query(self, param):
 
         return action_result.set_status(phantom.APP_SUCCESS)
 
+    def _body_from_part(self, part):
+        charset = part.get_content_charset() or "utf-8"
+        # decode the base64 unicode bytestring into plain text
+        return part.get_payload(decode=True).decode(
+            encoding=charset, errors="ignore"
+        )
+
+    def _create_artifact(self, file_name, attach_resp):
+        return {
+            "name": "Email Attachment Artifact",
+            "container_id": self.get_container_id(),
+            "cef": {
+                "vaultId": attach_resp[phantom.APP_JSON_HASH],
+                "fileHash": attach_resp[phantom.APP_JSON_HASH],
+                "file_hash": attach_resp[phantom.APP_JSON_HASH],
+                "fileName": file_name,
+            },
+            "run_automation": False,
+            "source_data_identifier": None,
+        }
+
+    def _parse_email_details(self, part, email_details):
+        headers = self._get_email_headers_from_part(part)
+        # split out important headers (for output table rendering)
+        if headers.get("to"):
+            email_details["to"] = headers["to"]
+
+        if headers.get("from"):
+            email_details["from"] = headers["from"]
+
+        if headers.get("subject"):
+            email_details["subject"] = headers["subject"]
+
+        part_type = part.get_content_type()
+        if part_type == "text/plain":
+            email_details["plain_bodies"].append(self._body_from_part(part))
+        elif part_type == "text/html":
+            email_details["html_bodies"].append(self._body_from_part(part))
+
+        email_details["email_headers"].append(headers)
+
+    def _get_payload_content(self, part):
+        if part.get_content_type().startswith("message/"):
+            return part.get_payload(0).as_string()
+        return part.get_payload(decode=True)
+
+    def _extract_attachment(self, part, action_result):
+        attach_resp = None
+        file_name = part.get_filename()
+        try:
+            # Create vault item with attachment payload
+            attach_resp = Vault.create_attachment(
+                self._get_payload_content(part),
+                container_id=self.get_container_id(),
+                file_name=file_name,
+            )
+        except Exception as e:
+            return action_result.set_status(
+                phantom.APP_ERROR,
+                f"Unable to add attachment: {file_name} Error: {self._get_error_message_from_exception(e)}",
+            )
+        if attach_resp.get("succeeded"):
+            # Create vault artifact
+            ret_val, msg, _ = self.save_artifact(
+                self._create_artifact(file_name, attach_resp)
+            )
+            if phantom.is_fail(ret_val):
+                return action_result.set_status(
+                    phantom.APP_ERROR,
+                    f"Could not save artifact to container: {msg}",
+                )
+        return phantom.APP_SUCCESS
+
+    @staticmethod
+    def _is_attachment(part):
+        return "attachment" in str(part.get("Content-Disposition"))
+
+    def _init_detail_fields(self, email_details):
+        email_details["plain_bodies"] = []
+        email_details["html_bodies"] = []
+        email_details["email_headers"] = []
+
+    def _join_email_bodies(self, email_details):
+        email_details["parsed_plain_body"] = "\n\n".join(
+            email_details.pop("plain_bodies")
+        )
+        email_details["parsed_html_body"] = "\n\n".join(
+            email_details.pop("html_bodies")
+        )
+
+    def __recursive_part_traverse(
+        self,
+        part,
+        email_details,
+        action_result,
+        extract_attachments=False,
+        extract_nested=False,
+        in_attachment=False,
+    ):
+        is_attachment = self._is_attachment(part)
+        # We are only gathering email data from top email, any attachment email should be omitted
+        if not is_attachment and not in_attachment:
+            self._parse_email_details(part, email_details)
+
+        ret_val = phantom.APP_SUCCESS
+
+        if is_attachment and extract_attachments:
+            ret_val = self._extract_attachment(part, action_result)
+            if phantom.is_fail(ret_val):
+                return ret_val
+
+        if not extract_nested and is_attachment:
+            return ret_val
+
+        if part.is_multipart():
+            for subpart in part.get_payload():
+                # We assume that everything that is under attachment is also an attachment
+                ret_val = ret_val and self.__recursive_part_traverse(
+                    subpart,
+                    email_details,
+                    action_result,
+                    extract_attachments,
+                    extract_nested,
+                    is_attachment or in_attachment,
+                )
+        return ret_val
+
+    def _parse_multipart_message(
+        self,
+        action_result,
+        msg,
+        email_details,
+        extract_attachments=False,
+        extract_nested=False,
+    ):
+        self._init_detail_fields(email_details)
+        ret_val = self.__recursive_part_traverse(
+            msg, email_details, action_result, extract_attachments, extract_nested
+        )
+        self._join_email_bodies(email_details)
+        return ret_val
+
     def _handle_get_email(self, param):
 
         self.save_progress("In action handler for: {0}".format(self.get_action_identifier()))
@@ -449,7 +522,13 @@ def _handle_get_email(self, param):
             msg = email.message_from_bytes(raw_encoded)
 
             if msg.is_multipart():
-                ret_val = self._parse_multipart_msg(action_result, msg, email_details_resp, param.get('extract_attachments', False))
+                ret_val = self._parse_multipart_message(
+                    action_result,
+                    msg,
+                    email_details_resp,
+                    param.get("extract_attachments", False),
+                    param.get("extract_nested", False),
+                )
 
                 if phantom.is_fail(ret_val):
                     return action_result.get_status()

diff --git a/release_notes/2.5.3.md b/release_notes/2.5.3.md
@@ -0,0 +1,4 @@
+* [PAPP-33478] Multipart message parsing improvement. 
+    * Implemented parsing nested attachments.
+    * Fixed email attachments overriding main email metadata.
+    * Added `extract_nested` action, which creates artifacts from attachments from nested email attachments. Works only when `extract_attachments` is set to `true`.