CrowdStrike BSOD Fix with AWS SSM Automation At Scale

During yesterday's events, I found several references for fixing EC2 Instances by attaching volumes to helper Instances and deleting the CrowdStrike file that was causing the issue. Unfortunately, this really doesn’t scale well. You’re limited to processing one broken Instance at a time per helper Instance you have running, and you have to try to keep track of what resources are available.

I made some changes from what I found, and have an SSM Automation Doc that spins up helper EC2 Instances per execution, allowing the process to scale much more quickly.


The SSM Automation Doc does the following:

  • Gathers information from the broken Instance including AZ and Subnet
    • The Helper Instance has to be in the same AZ as the EBS Volume you’re trying to fix
  • Spins up a new Windows Server EC2 Instance (defaults to 2022 base currently available, AMI may need to be changed as time progresses)
  • Shuts down the broken Instance, if not already shut down
  • Removes the Volume from the broken Instance and attaches it to the helper Instance
  • Sleeps for a bit to ensure that the Helper Instance is up and talking to SSM
    • If you’re running into timeouts on the RunCommand, bump this up
  • Deletes the CrowdStrike files
  • Removes Volume from Helper and re-attaches to source EC2 Instance
  • Starts the source EC2 Instance
  • Terminates the helper Instance


You will need the following:

  • A Security Group with outbound rules for
    • TCP/443 (HTTPS)
    • UDP/53 and TCP/53 (DNS)
  • An IAM Role with an Instance Profile to allow the helper Instance to reach SSM
  • The Instance ID
  • The Volume ID of the root volume
    • This probably can be retrieved from the Instance lookup, but initial attempts were error prone
    • Added benefit of this serving as a reference for what Volume ID belongs with what Instance in case you mess something up

How To Use

Create Automation Doc

Go to SSM -> Documents, select Create Document on the top right, and select Automation

Copy and paste the automation code below into the editor. Change any defaults you’d like for Roles/Security Groups

Execute Against Instance

Get the Instance ID and Root Volume ID from the Instance

Run the Automation Doc, providing both the Instance ID and the Root Volume ID.

Wait for the automation doc to complete


The Automation will fail on RunCommand if the helper Instance is unable to communicate with SSM. This requires a valid IAM Role and network access to SSM.

The RunCommand step will also fail if the helper EC2 Instance has not finished starting up and connecting to SSM. If this happens, increase the duration of the Sleep step (the HelperSleep parameter) to 2-3 minutes with PT2M or PT3M, respectively. Note that using smaller Instance Types as helper Instances (e.g., t2.micro) is not recommended, because they take longer to get going.

If the execution fails, you must reattach the original Volume to the original EC2 Instance if you want to execute the Automation Doc against that Instance again.

If the execution fails, terminate the helper Instance, or it will stick around. They are all conveniently named HelperInstance, and can be easily identified.

Automation Doc

# SSM Automation document: repairs a Windows EC2 Instance by moving its root
# volume to a short-lived helper Instance, deleting the faulty CrowdStrike
# channel file, and moving the volume back.
description: Automation to fix CrowdStrike BSOD with Ephemeral Helper Instances
schemaVersion: '0.3'
parameters:
  # The broken Instance whose root volume will be repaired
  InstanceId:
    type: String
    description: The ID of the EC2 Instance to fix

  # Root volume of the broken Instance; passed explicitly rather than looked up
  VolumeId:
    type: String
    description: The ID of the EBS volume to detach and reattach

  HelperInstanceType:
    type: String
    default: c7i.large
    description: The type of the helper EC2 instance

  # AMI IDs are region-specific and age out; override when the default is unavailable
  HelperInstanceAMI:
    type: String
    default: ami-00d990e7e5ece7974
    description: The AMI ID of the helper EC2 Instance, defaults to Server 2022 base

  # Used as the helper's IAM Instance Profile name so it can register with SSM
  SSMRole:
    type: String
    default: AmazonSSMRoleForInstancesQuickSetup
    description: The IAM role to use for the SSM document

  # Recommend setting a default SG here
  SecurityGroupId:
    type: String
    # default: sg-123456789132
    description: The ID of the security group to associate with the helper instance

  # ISO 8601 duration consumed by the Sleep step
  HelperSleep:
    type: String
    default: PT1M
    description: How long to sleep to wait for the helper Instance to come up, default 1 minute

mainSteps:

  # Look up the broken Instance's AZ and Subnet so the helper can be launched
  # alongside it (an EBS volume can only attach to an Instance in its own AZ).
  - name: GetInstanceInfo
    action: aws:executeAwsApi
    nextStep: LaunchHelperInstance
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeInstances
      InstanceIds:
        - '{{ InstanceId }}'
    outputs:
      - Name: AvailabilityZone
        Selector: $.Reservations[0].Instances[0].Placement.AvailabilityZone
        Type: String
      - Name: SubnetId
        Selector: $.Reservations[0].Instances[0].SubnetId
        Type: String

  # Launch one helper Instance in the same AZ/Subnet as the broken Instance.
  # It is tagged Name=HelperInstance so leftovers from failed runs are easy to find.
  - name: LaunchHelperInstance
    action: aws:executeAwsApi
    nextStep: WaitForHelperInstanceRunning
    isEnd: false
    inputs:
      Service: ec2
      Api: RunInstances
      ImageId: '{{ HelperInstanceAMI }}'
      InstanceType: '{{ HelperInstanceType }}'
      MinCount: 1
      MaxCount: 1
      # Instance Profile that lets the helper talk to SSM
      IamInstanceProfile:
        Name: '{{ SSMRole }}'
      BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
            VolumeType: gp3
            Encrypted: true
      # Require IMDSv2 on the helper
      MetadataOptions:
        HttpTokens: required
      Placement:
        AvailabilityZone: '{{ GetInstanceInfo.AvailabilityZone }}'
      SubnetId: '{{ GetInstanceInfo.SubnetId }}'
      SecurityGroupIds:
        - '{{ SecurityGroupId }}'
      TagSpecifications:
        - ResourceType: instance
          Tags:
            - Key: Name
              Value: HelperInstance
    outputs:
      - Name: InstanceId
        Selector: $.Instances[0].InstanceId
        Type: String

  # Block until EC2 reports the helper Instance as running.
  - name: WaitForHelperInstanceRunning
    action: aws:waitForAwsResourceProperty
    nextStep: StopInstance
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeInstances
      InstanceIds:
        - '{{ LaunchHelperInstance.InstanceId }}'
      PropertySelector: $.Reservations[0].Instances[0].State.Name
      DesiredValues:
        - running

  # Stop the broken Instance so its root volume can be detached.
  - name: StopInstance
    action: aws:changeInstanceState
    nextStep: WaitForInstanceStopped
    isEnd: false
    inputs:
      InstanceIds:
        - '{{ InstanceId }}'
      DesiredState: stopped

  # Block until the broken Instance is fully stopped.
  - name: WaitForInstanceStopped
    action: aws:waitForAwsResourceProperty
    nextStep: DetachVolume
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeInstances
      InstanceIds:
        - '{{ InstanceId }}'
      PropertySelector: $.Reservations[0].Instances[0].State.Name
      DesiredValues:
        - stopped

  # Detach the root volume from the (now stopped) broken Instance.
  - name: DetachVolume
    action: aws:executeAwsApi
    nextStep: WaitForVolumeAvailable
    isEnd: false
    inputs:
      Service: ec2
      Api: DetachVolume
      VolumeId: '{{ VolumeId }}'

  # Block until the volume is free to be attached elsewhere.
  - name: WaitForVolumeAvailable
    action: aws:waitForAwsResourceProperty
    nextStep: AttachVolumeToHelper
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeVolumes
      VolumeIds:
        - '{{ VolumeId }}'
      PropertySelector: $.Volumes[0].State
      DesiredValues:
        - available

  # Attach the broken root volume to the helper as a secondary disk.
  - name: AttachVolumeToHelper
    action: aws:executeAwsApi
    nextStep: WaitForVolumeAttached
    isEnd: false
    inputs:
      Service: ec2
      Api: AttachVolume
      VolumeId: '{{ VolumeId }}'
      InstanceId: '{{ LaunchHelperInstance.InstanceId }}'
      Device: /dev/sdf

  # Block until the attachment to the helper is complete.
  - name: WaitForVolumeAttached
    action: aws:waitForAwsResourceProperty
    nextStep: Sleep
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeVolumes
      VolumeIds:
        - '{{ VolumeId }}'
      PropertySelector: $.Volumes[0].Attachments[0].State
      DesiredValues:
        - attached

  # Give the helper time to boot and register with SSM before RunCommand.
  # Raise the HelperSleep parameter if the next step times out.
  - name: Sleep
    action: aws:sleep
    nextStep: DeleteCrowdStrikeDriver
    isEnd: false
    inputs:
      Duration: '{{HelperSleep}}'

  # Delete the faulty CrowdStrike channel file(s) from the attached volume.
  # NOTE(review): the script assumes the attached volume is online and mounted
  # as D: on the helper — confirm for your AMI / disk layout.
  - name: DeleteCrowdStrikeDriver
    action: aws:runCommand
    nextStep: DetachVolumeFromHelper
    isEnd: false
    inputs:
      DocumentName: AWS-RunPowerShellScript
      InstanceIds:
        - '{{ LaunchHelperInstance.InstanceId }}'
      Parameters:
        commands:
          - |
            Remove-Item -Path "d:\Windows\System32\drivers\CrowdStrike\C-00000291*.sys" -Force

  # Detach the repaired volume from the helper Instance.
  - name: DetachVolumeFromHelper
    action: aws:executeAwsApi
    nextStep: WaitForVolumeAvailableAgain
    isEnd: false
    inputs:
      Service: ec2
      Api: DetachVolume
      VolumeId: '{{ VolumeId }}'

  # Block until the repaired volume is free to be re-attached.
  - name: WaitForVolumeAvailableAgain
    action: aws:waitForAwsResourceProperty
    nextStep: AttachVolumeToOriginal
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeVolumes
      VolumeIds:
        - '{{ VolumeId }}'
      PropertySelector: $.Volumes[0].State
      DesiredValues:
        - available

  # Re-attach the repaired volume to the original Instance as its root device.
  - name: AttachVolumeToOriginal
    action: aws:executeAwsApi
    nextStep: StartInstance
    isEnd: false
    inputs:
      Service: ec2
      Api: AttachVolume
      VolumeId: '{{ VolumeId }}'
      InstanceId: '{{ InstanceId }}'
      Device: /dev/sda1

  # Start the original Instance again with its repaired root volume.
  - name: StartInstance
    action: aws:changeInstanceState
    nextStep: WaitForInstanceRunning
    isEnd: false
    inputs:
      InstanceIds:
        - '{{ InstanceId }}'
      DesiredState: running

  # Block until the original Instance is running before cleaning up the helper.
  - name: WaitForInstanceRunning
    action: aws:waitForAwsResourceProperty
    nextStep: TerminateHelperInstance
    isEnd: false
    inputs:
      Service: ec2
      Api: DescribeInstances
      InstanceIds:
        - '{{ InstanceId }}'
      PropertySelector: $.Reservations[0].Instances[0].State.Name
      DesiredValues:
        - running

  # Final step: terminate the helper. This is only reached on success, so a
  # failed execution leaves the helper running.
  - name: TerminateHelperInstance
    action: aws:changeInstanceState
    isEnd: true
    inputs:
      InstanceIds:
        - '{{ LaunchHelperInstance.InstanceId }}'
      DesiredState: terminated

Generating Commands

WARNING - Only 100 concurrent automation executions are allowed by default in an AWS Account. If you have more Instances than that, you need to batch them and wait for batches to complete before running new ones

Instead of manually looking up Instance IDs and Volume IDs, I recommend generating the commands to run them. There are definitely more robust ways to script and automate all this, but time is short, and this is simple.

Here’s a basic TypeScript script that takes a list of Instance IDs you provide (at the top) and generates SSM Commands for all of them. Bash, Python, or anything else preferred can be used instead. The Instance IDs can be retrieved from reports in AWS or other systems.

import { EC2Client, DescribeInstancesCommand } from "@aws-sdk/client-ec2"
import { GetCallerIdentityCommand, STSClient } from '@aws-sdk/client-sts'
import * as fs from 'fs'

// Region used for both the AWS SDK clients and the generated CLI commands
const REGION = 'us-east-1'
// Change to the document name in AWS
const CROWDSTRIKE_AUTO_DOC_NAME = 'CrowdStrikeFix'

// Instance IDs to generate commands for.
// Explicitly typed so an empty list is still a string array under strict mode.
const instanceIds: readonly string[] = [
    // 'i-0123456789abcdef0',
]

// AWS SDK clients, both bound to the configured region:
// EC2 for the Instance lookup, STS for resolving the account ID.
const ec2Client = new EC2Client( { region: REGION } )
const stsClient = new STSClient( { region: REGION } )

/**
 * Get the reservations for all EC2 Instances visible in the configured region.
 *
 * Follows `NextToken` so that results split across several pages are all
 * collected, and treats a missing `Reservations` field as an empty page.
 *
 * @returns Every reservation returned by DescribeInstances, across all pages.
 */
async function listInstances() {
    let page = await ec2Client.send( new DescribeInstancesCommand( {} ) )
    const reservations = [ ...( page.Reservations ?? [] ) ]

    // Keep fetching while the API reports that another page exists
    while ( page.NextToken ) {
        page = await ec2Client.send( new DescribeInstancesCommand( { NextToken: page.NextToken } ) )
        reservations.push( ...( page.Reservations ?? [] ) )
    }

    return reservations
}

/**
 * Generate one `aws ssm start-automation-execution` command per requested
 * Instance and write them, newline-separated, to a file named after the
 * current AWS account ID.
 *
 * Instances are matched against `instanceIds`. The root volume is the block
 * device mapping whose device name equals the Instance's `RootDeviceName`.
 * Requested Instances that cannot be resolved are reported with a warning
 * instead of being dropped silently.
 *
 * @throws Error if STS does not return an account ID.
 */
async function main() {
    const reservations = await listInstances()

    const commands: string[] = []
    const matchedIds = new Set<string>()

    for ( const reservation of reservations ) {
        for ( const instance of reservation.Instances ?? [] ) {
            const instanceId = instance.InstanceId
            if ( !instanceId || !instanceIds.includes( instanceId ) ) {
                continue
            }
            matchedIds.add( instanceId )

            const rootVolume = instance.BlockDeviceMappings?.find( mapping => mapping.DeviceName === instance.RootDeviceName )
            const volumeId = rootVolume?.Ebs?.VolumeId
            if ( !volumeId ) {
                console.warn( `No root EBS volume found for ${instanceId}, skipping` )
                continue
            }

            console.log( `Generated Command for ${instanceId} - ${volumeId}` )

            const command = `aws ssm start-automation-execution --document-name "${CROWDSTRIKE_AUTO_DOC_NAME}" --parameters "InstanceId=${instanceId},VolumeId=${volumeId}" --region ${REGION}`
            commands.push( command )
        }
    }

    // Surface requested IDs that were not found in this account/region
    for ( const requestedId of instanceIds ) {
        if ( !matchedIds.has( requestedId ) ) {
            console.warn( `Instance ${requestedId} was not found in ${REGION}, no command generated` )
        }
    }

    // Account ID used for file name
    const accountId = ( await stsClient.send( new GetCallerIdentityCommand( {} ) ) ).Account
    if ( !accountId ) {
        throw new Error( 'Unable to determine the AWS account ID from STS' )
    }

    const filePath = `${accountId}`
    fs.writeFileSync( filePath, commands.join( '\n' ) )

    console.log( `AWS CLI commands written to ${filePath}` )
}

// Run the generator. Failures are logged explicitly and reflected in the
// process exit code instead of surfacing as an unhandled rejection.
main()
    .then( () => {
        console.log( 'done' )
    } )
    .catch( ( error: unknown ) => {
        console.error( 'Failed to generate commands:', error )
        process.exitCode = 1
    } )


A file named after the account ID (<accountId>) will be created in the current directory. Its contents look like:

aws ssm start-automation-execution --document-name "CrowdStrikeFix" --parameters "InstanceId=i-0b56bf9865e747d30,VolumeId=vol-021a9e2a04b9d2888" --region us-east-1