#!/bin/bash
# ex:ts=4:sw=4:sts=4:et
# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-
#
# Copyright (c) 2020 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.

# Command search path: the /opt/mellanox tool directories come first, then
# the standard system directories.
PATH="/opt/mellanox/iproute2/sbin:/opt/mellanox/ethtool/sbin:/bin:/sbin:/usr/bin:/usr/sbin"

# Tunables: a non-empty value inherited from the environment wins over the
# default given here.
: "${CT_MAX_OFFLOADED_CONNS:=1000000}"
: "${MLXCONFIG_TIMEOUT:=60}"
: "${RDMA_SET_NETNS_TIMEOUT:=30}"
: "${SET_MODE_RETRY_NUM:=60}"
: "${RUN_CMD_RETRY_NUM:=10}"

# Extended regular expressions matched against "lspci -n" output
SUPPORTED_DEVICES='a2d[26cf]|1025'
BLUEFIELD_DEVICES='a2d[26cf]'

# IPsec units probed by stop_ipsec_services(); ipsec_was_active collects the
# ones that were found running so they can be started again later.
ipsec_services='strongswan.service ipsec.service'
ipsec_was_active=

# Exit status of the script
RC=0

# Log an informational message to syslog through logger(1).
# Arguments: all arguments are joined into the message text
# Globals:   prog (read) - syslog tag
# The tag falls back to the script's basename when $prog is empty or unset;
# an empty, unquoted tag would make logger take "-i" as the tag instead.
info()
{
	logger -t "${prog:-${0##*/}}" -i "INFO: $*"
}

# Log an error message to syslog through logger(1).
# Arguments: all arguments are joined into the message text
# Globals:   prog (read) - syslog tag
# The tag falls back to the script's basename when $prog is empty or unset;
# an empty, unquoted tag would make logger take "-i" as the tag instead.
error()
{
	logger -t "${prog:-${0##*/}}" -i "ERR: $*"
}

# Log a debug message to syslog through logger(1), only when LOG_LEVEL is
# exactly "debug".
# Arguments: all arguments are joined into the message text
# Globals:   LOG_LEVEL (read), prog (read) - syslog tag
# The tag falls back to the script's basename when $prog is empty or unset;
# an empty, unquoted tag would make logger take "-i" as the tag instead.
debug()
{
	if [ "X${LOG_LEVEL}" == "Xdebug" ]; then
		logger -t "${prog:-${0##*/}}" -i "DBG: $*"
	fi
}

# Resolve the program name before anything is logged: info()/error()/debug()
# pass $prog to logger as the syslog tag.
prog=$(basename -- "$0")

# is_bf=1 when lspci reports a Mellanox PCI bridge at address 00:00.0
if lspci -s 00:00.0 2> /dev/null | grep -wq "PCI bridge: Mellanox Technologies"; then
	is_bf=1
else
	is_bf=0
fi
if [ $is_bf -ne 1 ]; then
	# Check if the device is a Mellanox BlueField 4 or newer
	if [ -e /etc/mlnx-release ]; then
		# Count vendor 15b3 devices whose lspci line matches BLUEFIELD_DEVICES
		if [ "$(lspci -nD -d 15b3: | grep -cE "$BLUEFIELD_DEVICES")" -gt 0 ]; then
			info "Device is a Mellanox BlueField 4 or newer"
		else
			# No matching device: leave silently
			exit 0
		fi
	else
		info "Device is not a Mellanox BlueField. Exiting..."
		exit 0
	fi
fi

# Single-instance guard. pgrep options: -o oldest match, -x exact match,
# -f match against the full command line. The four patterns cover the
# interpreter/script path combinations checked for this program.
PID=$(pgrep -oxf "/bin/bash /sbin/$prog" \
        || pgrep -oxf "/bin/bash /usr/sbin/$prog" \
        || pgrep -oxf "/usr/bin/bash /sbin/$prog" \
        || pgrep -oxf "/usr/bin/bash /usr/sbin/$prog")
# Exit quietly when the oldest matching process is not this shell ($$)
if [[ -n $PID && $$ -ne $PID ]]; then
	# $prog is running already with PID: $PID
	exit 0
fi

# Run a shell command string, retrying once per second until it succeeds.
# Arguments: $1 - command line, evaluated with eval
# Globals:   RUN_CMD_RETRY_NUM (read; 10 when unset)
# Returns:   0 on success, 1 once the retry budget is exhausted
run_command_with_retry()
{
	# Locals, so that a caller's own "i" or "cmd" is never overwritten
	local cmd=$1
	local i=0
	local max_retries=${RUN_CMD_RETRY_NUM:-10}

	# The subshell keeps side effects of the evaluated command (variable
	# assignments, exit) away from the calling shell.
	while ! (eval "$cmd"); do
		if [ "$i" -gt "$max_retries" ]; then
			error "Failed to run $cmd after $i retries"
			return 1
		fi
		i=$((i+1))
		sleep 1
	done
	if [ "$i" -gt 0 ]; then
		info "$cmd passed after $i retries"
	fi

	return 0
}

# Print the flow steering mode of a PCI device.
# Arguments: $1 - PCI address of the device
# Outputs:   content of the sysfs compat "steering_mode" file(s) when present,
#            otherwise the last field of the last line printed by
#            "devlink dev param show ... name flow_steering_mode"
get_steering_mode()
{
	# Locals keep the working variables out of the caller's scope
	local pci_dev=$1
	local compat

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/steering_mode 2> /dev/null)
	if [ -n "$compat" ]; then
		# Unquoted on purpose: the listing holds one path per line
		cat ${compat} 2> /dev/null
	else
		run_devlink dev param show "pci/${pci_dev}" name flow_steering_mode | tail -1 | awk '{print $NF}'
	fi
}

# Set the flow steering mode of a PCI device, retrying on failure.
# Arguments: $1 - PCI address of the device
#            $2 - steering mode to configure
# Globals:   SET_MODE_RETRY_NUM (read)
# Returns:   0 on success, otherwise the status of the last failed attempt
#            after SET_MODE_RETRY_NUM + 1 attempts
set_steering_mode()
{
	local pci_dev=$1
	local mode=$2
	local rc=1
	# Deliberately not named "i": helpers invoked from the loop
	# (run_command_with_retry) may assign a variable of that name, which
	# would reset this counter and keep the loop from ever terminating.
	local attempt=0
	local compat

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/steering_mode 2> /dev/null)
	while [ $rc -ne 0 ]
	do
		if [ -n "$compat" ]; then
			run_command_with_retry "echo ${mode} > /sys/bus/pci/devices/${pci_dev}/net/*/compat/*/steering_mode"
		else
			run_command_with_retry "run_devlink dev param set pci/${pci_dev} name flow_steering_mode value ${mode} cmode runtime"
		fi
		rc=$?
		attempt=$((attempt+1))
		# Stop right away on success (no trailing sleep) or when out of retries
		if [ $rc -eq 0 ] || [ "$attempt" -gt "$SET_MODE_RETRY_NUM" ]; then
			break
		fi
		sleep 1
	done
	if [ $rc -ne 0 ]; then
		error "Failed to configure steering mode ${mode} for ${pci_dev} after $attempt retries"
	else
		info "Configured mode steering ${mode} for ${pci_dev} on try: $attempt"
	fi

	return $rc
}

# Print the eswitch mode of a PCI device.
# Arguments: $1 - PCI address of the device
# Outputs:   content of the sysfs compat "mode" file(s) when present,
#            otherwise the third space-separated field of
#            "devlink dev eswitch show pci/<device>"
get_eswitch_mode()
{
	# Locals keep the working variables out of the caller's scope
	local pci_dev=$1
	local compat

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/mode 2> /dev/null)
	if [ -n "$compat" ]; then
		# Unquoted on purpose: the listing holds one path per line
		cat ${compat}
	else
		run_devlink dev eswitch show "pci/${pci_dev}" 2> /dev/null | cut -d ' ' -f 3
	fi
}

# Set the eswitch mode of a PCI device, retrying once per second on failure.
# Arguments: $1 - PCI address of the device
#            $2 - eswitch mode to configure
# Globals:   SET_MODE_RETRY_NUM (read)
# Returns:   0 on success, otherwise the status of the last failed attempt
#            after SET_MODE_RETRY_NUM + 1 attempts
set_eswitch_mode()
{
	# Locals: the counter and status must not clobber same-named variables
	# of the caller.
	local pci_dev=$1
	local mode=$2
	local rc=1
	local attempt=0
	local compat

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/mode 2> /dev/null)
	while [ $rc -ne 0 ]
	do
		if [ -n "$compat" ]; then
			# ${compat} stays unquoted: more than one path makes the
			# redirection fail instead of writing to a bogus file name
			echo "${mode}" > ${compat}
		else
			run_devlink dev eswitch set "pci/${pci_dev}" mode "${mode}"
		fi
		rc=$?
		attempt=$((attempt+1))
		# Stop right away on success (no trailing sleep) or when out of retries
		if [ $rc -eq 0 ] || [ "$attempt" -gt "$SET_MODE_RETRY_NUM" ]; then
			break
		fi
		sleep 1
	done
	if [ $rc -ne 0 ]; then
		error "Failed to configure ${mode} mode for ${pci_dev} after $attempt retries"
	else
		info "Configured ${mode} mode for ${pci_dev} on try: $attempt"
	fi

	return $rc
}

# Print the value of a device parameter.
# Arguments: $1 - PCI address of the device
#            $2 - parameter name
# Outputs:   content of the sysfs compat file(s) named after the parameter
#            when present, otherwise the last field of the last line printed
#            by "devlink dev param show ... name <parameter>"
get_dev_param()
{
	# Locals keep the working variables out of the caller's scope
	local pci_dev=$1
	local name=$2
	local compat

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/"${name}" 2> /dev/null)
	if [ -n "$compat" ]; then
		# Unquoted on purpose: the listing holds one path per line
		cat ${compat} 2> /dev/null
	else
		run_devlink dev param show "pci/${pci_dev}" name "${name}" 2> /dev/null | tail -1 | awk '{print $NF}'
	fi
}

# Set a device parameter, through the sysfs compat file when present and
# through "devlink dev param set ... cmode runtime" otherwise.
# Arguments: $1 - PCI address of the device
#            $2 - parameter name
#            $3 - value to set
#            $4 - optional prefix for the log message
# Returns:   status of run_command_with_retry
set_dev_param()
{
	# Locals keep the working variables out of the caller's scope. No shift:
	# the message argument is optional, so fewer than four arguments is valid.
	local pci_dev=$1
	local name=$2
	local value=$3
	local msg=$4
	local compat rc

	compat=$(/bin/ls -1 /sys/bus/pci/devices/"${pci_dev}"/net/*/compat/*/"${name}" 2> /dev/null)
	if [ -n "$compat" ]; then
		run_command_with_retry "echo ${value} > ${compat}"
	else
		run_command_with_retry "run_devlink dev param set pci/${pci_dev} name ${name} value ${value} cmode runtime"
	fi
	rc=$?
	# ${msg:+...}: emit the prefix and its separating space only when given
	if [ $rc -ne 0 ]; then
		error "${msg:+${msg} }Failed to set parameter ${name} to ${value} value for ${pci_dev}"
	else
		info "${msg:+${msg} }Set ${name} parameter to ${value} value for ${pci_dev}"
	fi

	return $rc
}

# Run a command and print its combined stdout/stderr, re-running it once per
# second while the output contains "Failed to open".
# Arguments: the command line, as one string or as separate words
# Globals:   MLXCONFIG_TIMEOUT (read), MLXCONFIG_DEBUG_LOG (read; defaults to
#            /tmp/mlxconfig_debug.log), MLXCONFIG_DEBUG (exported)
# Outputs:   the output of the last run
# Exits the current shell with status 1 after MLXCONFIG_TIMEOUT seconds; when
# called through command substitution that shell is only the subshell.
run_mlxconfig()
{
	# Locals keep the working variables out of the caller's scope
	local cmd="$*"
	local mft_output=""
	local start_time elapsed
	local debug_log=${MLXCONFIG_DEBUG_LOG:-/tmp/mlxconfig_debug.log}

	start_time=$(awk '{print $1}' /proc/uptime)
	# $cmd is expanded unquoted on purpose so that a command line passed as a
	# single string is split into words
	mft_output="$($cmd 2>&1)"
	while (echo "$mft_output" | grep -q "Failed to open"); do
		echo "$mft_output" >> "$debug_log"
		# Seconds elapsed since start_time, rounded, based on /proc/uptime
		elapsed=$(awk -v start="$start_time" '{printf "%.0f", $1 - start}' /proc/uptime)
		if [ "$elapsed" -gt "$MLXCONFIG_TIMEOUT" ]; then
			error "Failed to run $cmd"
			exit 1
		fi
		sleep 1
		# Retries run with MLXCONFIG_DEBUG=1 exported
		export MLXCONFIG_DEBUG=1
		mft_output=$($cmd 2>&1)
		export MLXCONFIG_DEBUG=0
	done
	echo "$mft_output"
}

# Run a devlink sub-command; when devlink fails and mlxdevm is available, run
# the same sub-command through mlxdevm.
# Arguments: the devlink sub-command and its arguments
# Outputs:   the command output, joined onto a single line with runs of
#            whitespace collapsed
# Returns:   0 when devlink or the mlxdevm fallback succeeded, 1 otherwise
run_devlink()
{
	# Locals keep the working variables out of the caller's scope
	local cmd="$*"
	local output

	# echo $output is unquoted on purpose: callers split the result on
	# single spaces and take the last line.
	if output=$(devlink $cmd 2>&1); then
		echo $output
		return 0
	fi
	error "Failed to run devlink $cmd: $output"

	if command -v mlxdevm > /dev/null 2>&1; then
		if output=$(mlxdevm $cmd 2>&1); then
			# The fallback worked: hand its output to the caller
			echo $output
			return 0
		fi
		error "Failed to run mlxdevm $cmd: $output"
	fi
	return 1
}

# Check whether the link type of a port is IB.
# Arguments: $1 - device passed to "$mftconfig -d"
#            $2 - port number, appended to LINK_TYPE_P
# Globals:   mftconfig (read)
# Returns:   0 when the third column of the LINK_TYPE_P<port> line contains
#            "IB(1)", 1 otherwise
is_port_ib()
{
	# Locals keep the working variables out of the caller's scope
	local dev=$1
	local port=$2
	local mft_output

	mft_output=$(run_mlxconfig "$mftconfig -d ${dev} -e q LINK_TYPE_P$port")

	# The pattern is quoted: unquoted, ".*" is open to pathname expansion
	# against the files of the current directory.
	if echo "$mft_output" | grep -o 'LINK_TYPE_P.*' | awk '{print $3}' | grep -q "IB(1)"; then
		return 0
	fi
	return 1
}

# Check if the device is in Socket Direct mode
# Arguments: $1 - device passed to "$mftconfig -d"
# Globals:   mftconfig (read)
# Returns:   0 when the third column of the PF_SD_GROUP line is exactly "1",
#            1 otherwise
is_sd_mode()
{
	# Locals keep the working variables out of the caller's scope
	local dev=$1
	local mft_output

	mft_output=$(run_mlxconfig "$mftconfig -d ${dev} -e q PF_SD_GROUP")

	# The pattern is quoted: unquoted, ".*" is open to pathname expansion
	# against the files of the current directory.
	if echo "$mft_output" | grep -o 'PF_SD_GROUP.*' | awk '{print $3}' | grep -q "^1$"; then
		return 0
	fi
	return 1
}

# Check whether the lspci line of a PCI device matches SUPPORTED_DEVICES.
# Arguments: $1 - PCI address handed to "lspci -s"
# Globals:   SUPPORTED_DEVICES (read)
# Outputs:   nothing (grep -q keeps the matching line off stdout)
# Returns:   0 on a match, 1 otherwise
is_supported_device()
{
	local dev=$1

	if lspci -nD -s "${dev}" | grep -qE "$SUPPORTED_DEVICES"; then
		return 0
	fi
	return 1
}

# Determine whether a device runs in SmartNIC mode, based on the third column
# of the INTERNAL_CPU_MODEL line printed by "$mftconfig -e q".
# Arguments: $1 - device passed to "$mftconfig -d"
# Globals:   mftconfig (read)
# Returns:   1 when the value contains "SEPARATED_HOST(0)"; 0 when it contains
#            "EMBEDDED_CPU(1)" or when neither value is found
is_smartnic_mode()
{
	# Locals keep the working variables out of the caller's scope
	local dev=$1
	local mft_output cpu_model

	mft_output=$(run_mlxconfig "$mftconfig -d ${dev} -e q INTERNAL_CPU_MODEL")
	# Output and pattern are quoted: unquoted, both are open to pathname
	# expansion against the files of the current directory.
	cpu_model=$(echo "$mft_output" | grep -o 'INTERNAL_CPU_MODEL.*' | awk '{print $3}')

	if echo "$cpu_model" | grep -q "EMBEDDED_CPU(1)"; then
		info "Device ${dev} is in SmartNIC mode"
		return 0
	elif echo "$cpu_model" | grep -q "SEPARATED_HOST(0)"; then
		info "Device ${dev} is in SEPERATED_HOST mode"
		return 1
	fi
	# Assume SmartNIC mode if no match is found
	info "Device ${dev} is in SmartNIC mode"
	return 0
}

# Print the network interfaces that belong to the N-th supported device.
# Arguments: $1 - 1-based position in the list of vendor 15b3 devices whose
#                 lspci line matches SUPPORTED_DEVICES
# Globals:   SUPPORTED_DEVICES (read)
# Outputs:   one interface name per line; nothing when no device or no
#            interface is found
# Returns:   always 0
get_mlnx_netdevs_by_slot()
{
	# Locals keep the working variables out of the caller's scope
	local dev_num=$1
	local pci_id netdevs

	pci_id=$(lspci -nD -d 15b3: | grep -E "$SUPPORTED_DEVICES" | cut -d ' ' -f 1 | head -"${dev_num}" | tail -1)
	# An empty address would reduce the search string to "PCI_SLOT_NAME="
	# and match every interface
	if [ -z "$pci_id" ]; then
		return 0
	fi

	# -H: always prefix a match with its file path, even when the glob yields
	#     a single file; the interface name is path component 5
	# -F: the address is a literal string, its dots are not wildcards
	netdevs=$(grep -HF -- "PCI_SLOT_NAME=${pci_id}" /sys/class/net/*/device/uevent | cut -d '/' -f 5)
	if [ -n "$netdevs" ]; then
		echo "$netdevs"
	fi

	return 0
}

# Print the network interfaces whose device/uevent file names the given PCI
# slot.
# Arguments: $1 - PCI address to look for
# Outputs:   one interface name per line; nothing when the address is empty
#            or no interface is found
# Returns:   always 0
get_mlnx_netdevs_by_pci_id()
{
	# Locals keep the working variables out of the caller's scope
	local pci_id=$1
	local netdevs

	# An empty address would reduce the search string to "PCI_SLOT_NAME="
	# and match every interface
	if [ -z "$pci_id" ]; then
		return 0
	fi

	# -H: always prefix a match with its file path, even when the glob yields
	#     a single file; the interface name is path component 5
	# -F: the address is a literal string, its dots are not wildcards
	netdevs=$(grep -HF -- "PCI_SLOT_NAME=${pci_id}" /sys/class/net/*/device/uevent | cut -d '/' -f 5)
	if [ -n "$netdevs" ]; then
		echo "$netdevs"
	fi

	return 0
}

# Helper: report whether a value occurs in a list
# Arguments: $1 - value to look for; remaining arguments - the list
# Returns:   0 when some list element equals the value exactly, 1 otherwise
in_array() {
    local wanted=$1
    local candidate
    shift

    # "for name" without a list walks the remaining positional parameters
    for candidate; do
        if [[ "$candidate" == "$wanted" ]]; then
            return 0
        fi
    done
    return 1
}

# Temporarily stop the active IPsec-related services so that offloaded IPsec
# bypass policies are removed and the eswitch mode can be changed.
# Globals: ipsec_services (read); ipsec_was_active (appended with every unit
#          found active, for start_ipsec_services)
stop_ipsec_services()
{
	local unit

	# Pass 1: remember which units are currently active
	for unit in $ipsec_services; do
		systemctl is-active --quiet $unit || continue
		ipsec_was_active="${ipsec_was_active:+$ipsec_was_active }$unit"
	done

	# Pass 2: stop all of them
	for unit in $ipsec_was_active; do
		systemctl stop $unit
	done

	# Pass 3: report the outcome for each unit
	for unit in $ipsec_was_active; do
		if ! systemctl is-active --quiet $unit; then
			info "Stopped $unit successfully"
		else
			error "Failed to stop $unit"
		fi
	done
}

# Start again every IPsec-related service recorded in ipsec_was_active and
# log, per unit, whether it is active afterwards.
# Globals: ipsec_was_active (read)
start_ipsec_services()
{
	local unit

	for unit in $ipsec_was_active; do
		systemctl start $unit
		if ! systemctl is-active --quiet $unit; then
			error "Failed to start $unit"
		else
			info "Started $unit successfully"
		fi
	done
}

# PCI address of the first vendor 15b3 device whose lspci line matches
# BLUEFIELD_DEVICES
pci_id=$(lspci -nD -d 15b3: | grep -E "$BLUEFIELD_DEVICES" | cut -d ' ' -f 1 | head -1)
# Last field of the "host_en" line that mlxreg reports for register MMHI
host_en=$(mlxreg -d ${pci_id} --get --reg_name MMHI | grep host_en | awk '{print $NF}')

# Let shell arithmetic turn host_en into a decimal integer (empty becomes 0)
decimal=$(( host_en ))
# 1 when at most one bit is set: x & (x - 1) clears the lowest set bit
is_power_of_two=$(( (decimal & (decimal - 1)) == 0 ))

# Non-zero with exactly one bit set: treated as a Controller device
# NOTE(review): presumably host_en is a bitmask of enabled hosts - confirm
# against the MMHI register description
if [[ $decimal != 0 && $is_power_of_two == 1 ]]; then
	info "Detected Controller device. Exiting..."
	exit 0
fi

# is_SecureBoot=1 when mokutil reports "SecureBoot enabled"
is_SecureBoot=0
if (mokutil --sb-state 2>&1 | grep -q "SecureBoot enabled"); then
	is_SecureBoot=1
fi

# With Secure Boot enabled, run "mst start" unless a /dev/mst/mt*_pciconf0
# device node already exists
if [ $is_SecureBoot -eq 1 ]; then
	mst_dev=`/bin/ls -1 /dev/mst/mt*_pciconf0 2> /dev/null`
	if [ ! -n "${mst_dev}" ]; then
		mst start > /dev/null 2>&1
	fi
fi

# Configuration tool: mlxconfig when /usr/bin/mlxconfig is executable,
# mstconfig otherwise
mftconfig=mstconfig
if [ -x /usr/bin/mlxconfig ]; then
	mftconfig=mlxconfig
fi

# Optional site configuration; it is sourced, so it may set any of the
# variables defaulted below
if [ -f /etc/mellanox/mlnx-bf.conf ]; then
	. /etc/mellanox/mlnx-bf.conf
fi
# Defaults for settings that are still unset or empty
IPSEC_FULL_OFFLOAD=${IPSEC_FULL_OFFLOAD:-"no"}
LAG_HASH_MODE=${LAG_HASH_MODE:-"yes"}
ENABLE_ESWITCH_MULTIPORT=${ENABLE_ESWITCH_MULTIPORT:-"no"}
MLXCONFIG_DEBUG_LOG=${MLXCONFIG_DEBUG_LOG:-"/tmp/mlxconfig_debug.log"}
LOG_LEVEL=${LOG_LEVEL:-"info"}
# Device lists filled by the per-device configuration loop
SWITCHDEV_DEVICES_LIST=()
SD_DEVICES_LIST=()
DEVICE_LIST_FOR_OVS_BRIDGES=()
# Switch the RDMA subsystem to exclusive network namespace mode unless
# RDMA_SET_NETNS_EXCLUSIVE is set to something other than "yes"
RDMA_SET_NETNS_EXCLUSIVE=${RDMA_SET_NETNS_EXCLUSIVE:-"yes"}
if [ "X${RDMA_SET_NETNS_EXCLUSIVE}" == "Xyes" ]; then
	if ! (rdma system show netns 2>&1 | grep -q exclusive); then
		# Retry once per second; elapsed time is measured with /proc/uptime
		start_time=$(awk '{print $1}' /proc/uptime)
		while ! (rdma system set netns exclusive); do
			elapsed=$(awk "BEGIN {printf \"%.0f\", $(awk '{print $1}' /proc/uptime) - $start_time}")
			if [ $elapsed -gt $RDMA_SET_NETNS_TIMEOUT ]; then
				break
			fi
			sleep 1
		done
	fi

	# Log the final state; a failure is reported but does not stop the script
	if (rdma system show netns 2>&1 | grep -q exclusive); then
		info "The RDMA subsystem is set in network namespace exclusive mode."
	else
		error "Failed to set rdma exclusive mode"
	fi
fi

# First vendor 15b3 device whose lspci line matches BLUEFIELD_DEVICES
ctrl_dev=$(lspci -nD -d 15b3: | grep -E "$BLUEFIELD_DEVICES" | cut -d ' ' -f 1 | head -1)
# Number of vendor 15b3 devices whose lspci line matches SUPPORTED_DEVICES
num_of_supported_devs=$(lspci -nD -d 15b3: | grep -E "$SUPPORTED_DEVICES" | wc -l)

# Query INTERNAL_CPU_MODEL on ctrl_dev. While the tool answers
# "Failed to open", append the output to MLXCONFIG_DEBUG_LOG and retry once
# per second; give up with exit status 1 after MLXCONFIG_TIMEOUT seconds.
start_time=$(awk '{print $1}' /proc/uptime)
mft_output=$($mftconfig -d $ctrl_dev -e q INTERNAL_CPU_MODEL 2>&1)
if ! (echo "$mft_output" | grep -q 'E- The Device doesn.*t support INTERNAL_CPU_MODEL parameter'); then
	while (echo "$mft_output" | grep -q "Failed to open"); do
		echo "$mft_output" >> ${MLXCONFIG_DEBUG_LOG}
		elapsed=$(awk "BEGIN {printf \"%.0f\", $(awk '{print $1}' /proc/uptime) - $start_time}")
		if [ $elapsed -gt $MLXCONFIG_TIMEOUT ]; then
			error "Failed to run $mftconfig query on $ctrl_dev after $elapsed seconds"
			exit 1
		fi
		sleep 1
		mft_output=$($mftconfig -d $ctrl_dev -e q INTERNAL_CPU_MODEL 2>&1)
	done
else
	info "Device $ctrl_dev does not support INTERNAL_CPU_MODEL parameter"
fi
info "MFT output: $mft_output"

# Per-device configuration. Failures of set_eswitch_mode are summed into RC;
# the return values of set_dev_param and set_steering_mode are not checked.
num_of_devs_switchdev=0
# Set eswitch mode to switchdev for all devices
if (is_smartnic_mode "$ctrl_dev"); then
	# IPsec services are stopped for the duration of the loop and started
	# again afterwards
	stop_ipsec_services
	# Iterate over the PCI addresses of all "pci/..." devlink devices
	for dev in $(devlink dev show | awk -F'/' '/^pci\//{gsub(/:$/,"",$2); print $2}')
	do
		# Port number = last character of the PCI address + 1
		port=$(( ${dev: -1} + 1 ))
		if (is_port_ib "$dev" "$port"); then
			info "Link type is IB for ${dev}. Skipping mode confiugration."
			continue
		fi

		# LAG port selection: queue_affinity when LAG_HASH_MODE is "no",
		# multiport_esw (set after moving the eswitch to legacy mode) when
		# LAG_MULTIPORT_ESW_MODE is "yes"
		if [ "X${LAG_HASH_MODE}" == "Xno" ]; then
			set_dev_param ${dev} lag_port_select_mode queue_affinity
		elif [ "X${LAG_MULTIPORT_ESW_MODE}" == "Xyes" ]; then
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_dev_param ${dev} lag_port_select_mode multiport_esw
		fi

		# ENCAP_NONE_MODE=yes: move to legacy mode, then set "encap" to none
		if [ "X${ENCAP_NONE_MODE}" == "Xyes" ]; then
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_dev_param ${dev} encap none
		fi

		# A device found in dmfs steering mode is moved to legacy eswitch
		# mode and switched to smfs
		steering_mode=`get_steering_mode ${dev}`
		if [ "${steering_mode}" == "dmfs" ]; then
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_steering_mode ${dev} smfs
		fi

		# IPsec full offload: only when the lscpu "Flags" line lists sha1,
		# sha2 and aes; then steering mode smfs is switched to dmfs
		if [ "${IPSEC_FULL_OFFLOAD}" == "yes" ]; then
			lscpu | grep Flags | grep sha1 | grep sha2 | grep -q aes
			if [ $? -eq 0 ]; then
				eswitch_mode=`get_eswitch_mode ${dev}`
				if [ "${eswitch_mode}" != "legacy" ]; then
					set_eswitch_mode ${dev} legacy
					RC=$((RC+$?))
				fi
				steering_mode=`get_steering_mode ${dev}`
				if [ "${steering_mode}" == "smfs" ]; then
					set_steering_mode ${dev} dmfs
				fi
			else
				info "Crypto disabled on this devide. Skipping IPsec mode configuration."
			fi
		fi

		# Move the eswitch to switchdev mode; vport_match_mode is set before
		# the switch, ct_max_offloaded_conns after it
		eswitch_mode=`get_eswitch_mode ${dev}`
		if [ "${eswitch_mode}" != "switchdev" ]; then
			if [ "X${LEGACY_METADATA_MATCH_MODE}" == "Xyes" ]; then
				set_dev_param ${dev} vport_match_mode legacy
			fi

			set_eswitch_mode ${dev} switchdev
			RC=$((RC+$?))

			set_dev_param ${dev} ct_max_offloaded_conns ${CT_MAX_OFFLOADED_CONNS}
		fi
		# Re-read the mode: only devices that really are in switchdev mode
		# are counted and recorded
		eswitch_mode=`get_eswitch_mode ${dev}`
		if [ "${eswitch_mode}" == "switchdev" ]; then
			if [ "X${ENABLE_ESWITCH_MULTIPORT}" == "Xyes" ]; then
				set_dev_param ${dev} esw_multiport  1 "ESW Multiport:"
			fi
			num_of_devs_switchdev=$((num_of_devs_switchdev+1))
			SWITCHDEV_DEVICES_LIST+=( "$dev" )
		fi
		# Record Socket Direct devices and devices eligible for an OVS bridge
		if is_sd_mode ${dev}; then
			debug "Adding SD device $dev to SD_DEVICES_LIST"
			SD_DEVICES_LIST+=( "$dev" )
		fi
		if is_supported_device ${dev}; then
			debug "Adding supported device $dev to DEVICE_LIST_FOR_OVS_BRIDGES"
			DEVICE_LIST_FOR_OVS_BRIDGES+=( "$dev" )
		fi
	done
	start_ipsec_services
fi

# Summary of the per-device configuration (logged only at debug level)
debug "num_of_devs_switchdev: $num_of_devs_switchdev"
debug "SWITCHDEV_DEVICES_LIST: ${SWITCHDEV_DEVICES_LIST[*]}"
debug "SD_DEVICES_LIST: ${SD_DEVICES_LIST[*]}"
debug "DEVICE_LIST_FOR_OVS_BRIDGES: ${DEVICE_LIST_FOR_OVS_BRIDGES[*]}"

# chroot detection: "/" and the root directory of PID 1 differ in their
# device:inode pair
if [ "$(stat -c %d:%i /)" != "$(stat -c %d:%i /proc/1/root/.)" ]; then
	info "Running in chroot environment. Exiting..."
	exit 0
fi

# Stop here when any eswitch mode change failed
if [ $RC -ne 0 ]; then
	error "Exiting due to failures. RC=$RC"
	exit $RC
fi

# Nothing left to do when no device ended up in switchdev mode
if [ $num_of_devs_switchdev -eq 0 ]; then
	info "No devices configured to switchdev mod. Skipping SF/Bridges configuration."
	exit 0
fi

# Optional configuration file, sourced into this shell
if [ -f /etc/mellanox/mlnx-sf.conf ]; then
	. /etc/mellanox/mlnx-sf.conf
fi

# Hugepages helper tool and the values used when OVS_HUGEPAGE_SIZE /
# OVS_HUGEPAGE_NUM are not set
# NOTE(review): the unit of the size value is not visible here - confirm
# against the doca-hugepages documentation
HUGEPAGES_TOOL=/usr/sbin/doca-hugepages
OVS_DEFAULT_HUGEPAGE_SIZE=2048
OVS_DEFAULT_HUGEPAGE_NUM=512

# Register the default ovs-doca hugepages configuration; does nothing unless
# OVS_DOCA is "yes".
# Globals: OVS_DOCA, HUGEPAGES_TOOL, OVS_DEFAULT_HUGEPAGE_SIZE,
#          OVS_DEFAULT_HUGEPAGE_NUM (read); OVS_HUGEPAGE_SIZE,
#          OVS_HUGEPAGE_NUM (set to the defaults when empty)
add_default_hugepages_configurations()
{
	if [ "X${OVS_DOCA}" == "Xyes" ]; then
		# Fall back to the defaults for values that are unset or empty
		: "${OVS_HUGEPAGE_SIZE:=$OVS_DEFAULT_HUGEPAGE_SIZE}"
		: "${OVS_HUGEPAGE_NUM:=$OVS_DEFAULT_HUGEPAGE_NUM}"
		info "Adding ovs-doca default hugepages configuration"
		$HUGEPAGES_TOOL config --force --app "ovs-doca (default)" --size $OVS_HUGEPAGE_SIZE --num $OVS_HUGEPAGE_NUM
	fi
}

# Add the default ovs-doca hugepages configuration (when applicable), then
# ask the hugepages tool to reload.
# Globals: HUGEPAGES_TOOL (read)
config_hugepages()
{
	add_default_hugepages_configurations

	info "Applying hugepages configuration"
	${HUGEPAGES_TOOL} reload
}

# Remove the ovs-doca hugepages configuration when OVS_DOCA is not "yes" and
# the tool's "show" output still mentions ovs-doca.
# Globals: OVS_DOCA, HUGEPAGES_TOOL (read)
cleanup_hugepages()
{
	# Keep the configuration while ovs-doca is enabled
	if [ "X${OVS_DOCA}" == "Xyes" ]; then
		return 0
	fi
	# Nothing registered: nothing to remove
	if ! $HUGEPAGES_TOOL show | grep -q "ovs-doca"; then
		return 0
	fi

	info "Removing ovs-doca default hugepages configuration"
	$HUGEPAGES_TOOL remove ovs-doca
	$HUGEPAGES_TOOL reload
}

# Optional OVS configuration file, sourced into this shell
if [ -f /etc/mellanox/mlnx-ovs.conf ]; then
	. /etc/mellanox/mlnx-ovs.conf
fi

# Configure hugepages
config_hugepages

# Configure the default OVS bridge
# vsctl holds the path of ovs-vsctl; empty when the tool is not found
vsctl=`which ovs-vsctl 2> /dev/null`
if [ ! -n "$vsctl" ]; then
	info "OVS is not installed. Skipping OVS bridges creation."
	exit 0
fi

# Upper bounds for bridge creation: one bridge per switchdev device; for
# Socket Direct devices one bridge when there is a single SD device, two
# bridges when there are more
max_num_of_ovs_bridges=$num_of_devs_switchdev
max_num_of_sd_ovs_bridges=0
num_of_sd_devs=${#SD_DEVICES_LIST[@]}
if [ $num_of_sd_devs -gt 0 ]; then
	if [ $num_of_sd_devs -eq 1 ]; then
		max_num_of_sd_ovs_bridges=1
	else
		max_num_of_sd_ovs_bridges=2
	fi
fi

debug "max_num_of_ovs_bridges: $max_num_of_ovs_bridges"
debug "max_num_of_sd_ovs_bridges: $max_num_of_sd_ovs_bridges"

# Assign default values to OVS_BRIDGE<n> and OVS_BRIDGE<n>_PORTS, one pair per
# device in DEVICE_LIST_FOR_OVS_BRIDGES. Values that are already set are left
# untouched. Socket Direct devices beyond max_num_of_sd_ovs_bridges are
# skipped and do not consume a bridge number.
num_of_sd_ovs_bridges=0
num_of_ovs_bridges=0
i=1
for dev in "${DEVICE_LIST_FOR_OVS_BRIDGES[@]}"; do
	sd_dev=""
	if [ $num_of_sd_devs -gt 0 ]; then
		if in_array "$dev" "${SD_DEVICES_LIST[@]}"; then
			if [ $num_of_sd_ovs_bridges -lt $max_num_of_sd_ovs_bridges ]; then
				num_of_sd_ovs_bridges=$((num_of_sd_ovs_bridges+1))
			else
				continue
			fi
			sd_dev=$dev
		fi
	fi
	# Names of the variables to fill; read through ${!name}, written through
	# printf -v (no eval needed)
	bridge_var=OVS_BRIDGE${i}
	ports_var=OVS_BRIDGE${i}_PORTS

	# Set default bridge name if not already set
	if [ -z "${!bridge_var}" ]; then
		printf -v "$bridge_var" '%s' "ovsbr${i}"
	fi

	# Set default ports if not already set
	if [ -z "${!ports_var}" ]; then
		printf -v "$ports_var" '%s' "$(get_mlnx_netdevs_by_pci_id "$dev")"
		# Workaround: Add extra netdev
		if [ "$dev" == "$sd_dev" ]; then
			# Count the ports in the list itself (${!ports_var}), not the
			# words in the variable's name
			num_of_ports=$(wc -w <<< "${!ports_var}")
			if [ $num_of_ports -lt 4 ]; then
				# Add extra netdev as a workaround for SD device
				# increase the last digit of the dev name by 2
				new_dev_name=${dev:0:-1}$(( ${dev: -1} + 2 ))
				extra_netdevs=$(get_mlnx_netdevs_by_pci_id "$new_dev_name")
				# Choose netdev that has "<string><num>pf<num><string>" in its name
				extra_netdev=$(echo "$extra_netdevs" | grep -oE '[[:alnum:]]+[0-9]pf[0-9][[:alnum:]]+')
				if [ -n "$extra_netdev" ]; then
					debug "Adding extra netdev $extra_netdev to $ports_var"
					printf -v "$ports_var" '%s' "${!ports_var} $extra_netdev"
				fi
			fi
		fi
	fi
	i=$((i+1))
	num_of_ovs_bridges=$((num_of_ovs_bridges+1))
done

# Defaults for settings that are still unset or empty
OVS_HW_OFFLOAD=${OVS_HW_OFFLOAD:-"yes"}
OVS_TIMEOUT=${OVS_TIMEOUT:-30}

# Determine the systemd unit of Open vSwitch: openvswitch.service when the
# init script or the unit file exists, otherwise whatever
# "systemctl list-unit-files" reports
# NOTE(review): both alternatives of the grep pattern below are identical -
# confirm whether a second unit name was intended
ovs_service=""
if [ -e /etc/init.d/openvswitch ]; then
	ovs_service="openvswitch.service"
elif [ -e /usr/lib/systemd/system/openvswitch.service ]; then
	ovs_service="openvswitch.service"
else
	ovs_service=`systemctl list-unit-files 2> /dev/null | grep -E "openvswitch.service|openvswitch.service" | awk '{print $1}'`
fi

if ! (systemctl is-enabled $ovs_service 2> /dev/null | grep -wq enabled); then
	# OVS service is not enabled
	info "$ovs_service is not enabled. Exiting..."
	exit 0
fi

# Command line used by the ovs_restart function (the variable and the
# function share the name but live in separate namespaces)
ovs_restart="systemctl restart $ovs_service"

# Set to true by the configuration steps that require an OVS restart
need_restart=false

# Restart the OVS service when a restart is pending, then re-apply the udev
# settings and enable hw-tc-offload on all configured bridge ports.
# Globals: need_restart (read; reset to false after the restart),
#          ovs_service, ovs_restart (the restart command line),
#          num_of_devs_switchdev, OVS_BRIDGE<n>_PORTS (read)
# Returns: 0 right away when need_restart is "false"; otherwise the status of
#          the last command executed
ovs_restart()
{
	# Locals: the loop variables must not clobber the caller's "i" and "p"
	local p i br_ports

	if [ "$need_restart" == "false" ]; then
		info "Restarting of $ovs_service is not required"
		return 0
	fi

	if [ -n "$ovs_service" ]; then
		info "Restarting $ovs_service"
		$ovs_restart
		need_restart=false
	fi

	# Re-apply udev settings removed by OVS
	if [ -x /lib/udev/mlnx_bf_udev ]; then
		# Walk the interfaces with a glob instead of parsing ls output
		for p in /sys/class/net/*
		do
			p=${p##*/}
			case "$p" in
				p*|e*)
				/lib/udev/mlnx_bf_udev $p > /dev/null 2>&1
				;;
				*)
				;;
			esac
		done
	fi

	# Enable hw-tc-offload for all bridge ports dynamically
	for ((i = 1; i <= num_of_devs_switchdev; i++))
	do
		br_ports=OVS_BRIDGE${i}_PORTS
		br_ports=${!br_ports}
		for p in $br_ports
		do
			ethtool -K $p hw-tc-offload on
		done
	done
}

# Everything from here to the matching "} &" runs as a background job, i.e.
# in a subshell: variables changed and functions defined inside it are not
# visible to the parent shell, and "exit" only ends the background job.
{
# Wait until "ovs-vsctl show" succeeds, polling once per second; give up
# after OVS_TIMEOUT seconds (measured with /proc/uptime)
start_time=$(awk '{print $1}' /proc/uptime)
while ! ($vsctl show > /dev/null 2>&1)
do
	elapsed=$(awk "BEGIN {printf \"%.0f\", $(awk '{print $1}' /proc/uptime) - $start_time}")
	if [ $elapsed -gt $OVS_TIMEOUT ]; then
		info "$ovs_service is not up. Exiting..."
		exit 1
	fi
	sleep 1
done

# Make sure Other_config:default-datapath-type of the Open_vSwitch table has
# the requested value.
# Arguments: $1 - datapath type (defaults to "system")
# Globals:   vsctl (read)
# Returns:   0 when the value is in place, 1 when it could not be set
ovs_config_default_datapath()
{
	local datapath=${1:-"system"}
	local wanted="default-datapath-type=$datapath"

	# Write the value only when it is not there yet
	if ! $vsctl get Open_vSwitch . Other_config 2> /dev/null | grep -q "$wanted"; then
		if $vsctl --no-wait set Open_vSwitch . Other_config:default-datapath-type=$datapath 2> /dev/null; then
			info "OVS default-datapath-type is set"
		fi
	fi

	# Read back to verify the result
	if $vsctl get Open_vSwitch . Other_config 2> /dev/null | grep -q "$wanted"; then
		return 0
	fi
	error "Failed setting OVS default-datapath-type"
	return 1
}

# Remove the DOCA related keys (default-datapath-type, doca-init) from
# Other_config of the Open_vSwitch table when they are present.
# Globals: vsctl (read); need_restart (set to true when doca-init was removed)
ovs_cleanup_doca()
{
	local key

	for key in default-datapath-type doca-init; do
		if ! $vsctl get Open_vSwitch . Other_config 2> /dev/null | grep -q "$key"; then
			continue
		fi
		info "OVS cleanup $key"
		$vsctl --no-wait remove Open_vSwitch . other_config $key
		# Only the removal of doca-init calls for an OVS restart
		if [ "$key" == "doca-init" ]; then
			need_restart=true
		fi
	done
}

# Enable OVS-DOCA when OVS_DOCA is "yes", otherwise remove its settings.
# Globals: OVS_DOCA, vsctl (read); need_restart (set to true when doca-init
#          was written successfully)
# Returns: 0 when DOCA is disabled or already initialized, otherwise the
#          status of ovs_config_default_datapath
ovs_config_doca()
{
	local doca_initialized

	if [ "X${OVS_DOCA}" != "Xyes" ]; then
		ovs_cleanup_doca
		return 0
	fi

	# Nothing to do when OVS reports DOCA as initialized already
	doca_initialized=$($vsctl get Open_vSwitch . doca_initialized)
	if [ "$doca_initialized" == "true" ]; then
		return 0
	fi

	$vsctl --no-wait set Open_vSwitch . Other_config:doca-init="true" 2> /dev/null
	# Read back: a restart is requested only when the key was really stored
	if ($vsctl get Open_vSwitch . Other_config 2> /dev/null | grep -q 'doca-init="true"'); then
		info "OVS doca-init is set to true"
		need_restart=true
	else
		info "OVS failed setting doca-init to true"
	fi

	ovs_config_default_datapath netdev
}

# Verify that every port of every OVS bridge exists as a network interface
# under /sys/class/net.
# Globals: vsctl (read)
# Returns: 0 when all ports exist, 1 at the first missing port
is_ovs_config_valid()
{
	local bridge member

	for bridge in $($vsctl list-br); do
		for member in $($vsctl list-ports $bridge); do
			if [ -d /sys/class/net/$member ]; then
				continue
			fi
			info "Port $member for bridge $bridge does not exist. OVS configuration is invalid."
			return 1
		done
	done
	return 0
}

# Delete every bridge reported by "ovs-vsctl list-br".
# Globals: vsctl (read)
ovs_remove_bridges()
{
	local bridge
	local -a bridges

	# Word splitting is intended: one bridge name per word
	bridges=( $($vsctl list-br) )
	for bridge in "${bridges[@]}"; do
		$vsctl del-br $bridge
		debug "Removed bridge $bridge"
	done
}

#Enable OVS-DOCA
ovs_config_doca

#Cleanup hugepages if necessary
cleanup_hugepages

# CREATE_OVS_BRIDGES other than "yes": skip bridge creation, restart OVS if
# a restart is pending and end the background job with status RC
CREATE_OVS_BRIDGES=${CREATE_OVS_BRIDGES:-"yes"}
if [ "X${CREATE_OVS_BRIDGES}" != "Xyes" ]; then
    debug "CREATE_OVS_BRIDGES is not set to yes. Skipping OVS bridges creation."
    if [ "$need_restart" == true ]; then
		ovs_restart
    fi
	exit $RC
fi

# Bridges exist already: keep them when all their ports exist, otherwise
# remove them so that they are created again below
ovsbr_number=`$vsctl list-br | wc -l`
if [ $ovsbr_number -gt 0 ]; then
	if is_ovs_config_valid; then
		debug "OVS bridges already exist and configuration is valid. Skipping OVS bridges creation."
		# ovs_restart is called when hw-offload="true" is configured or a
		# restart is pending; the grep has no -q, so a match is also printed
		if ($vsctl get Open_vSwitch . Other_config 2> /dev/null | grep 'hw-offload="true"') || [ "$need_restart" == true ]; then
			ovs_restart
		fi
		exit $RC
	else
		info "Removing OVS bridges and creating new ones."
		ovs_remove_bridges
		ovsbr_number=0
	fi
fi

# Create the bridges OVS_BRIDGE1..OVS_BRIDGE<num_of_ovs_bridges> and add the
# ports listed in the matching OVS_BRIDGE<n>_PORTS variable
OVS_BR_PORTS_TIMEOUT=${OVS_BR_PORTS_TIMEOUT:-30}
for i in `seq $num_of_ovs_bridges`
do
	# Resolve bridge name and port list through indirect expansion
	br_name=OVS_BRIDGE${i}
	br_name=${!br_name}
	br_ports=OVS_BRIDGE${i}_PORTS
	br_ports=${!br_ports}

	# An existing bridge is only brought up (not on kernels whose release
	# string contains "4.19")
	if ($vsctl br-exists $br_name); then
		info "bridge $br_name exist already."
		if ! [[ $(uname -r) =~ "4.19" ]]; then
			ip link set $br_name up
		fi
		continue
	fi

	missing_port=0
	ovs_br_ports=""
	for port in $br_ports
	do
		# Wait up to OVS_BR_PORTS_TIMEOUT seconds for the interface to appear
		start_time=$(awk '{print $1}' /proc/uptime)
		while ! [ -d /sys/class/net/$port ]
		do
			elapsed=$(awk "BEGIN {printf \"%.0f\", $(awk '{print $1}' /proc/uptime) - $start_time}")
			if [ $elapsed -gt $OVS_BR_PORTS_TIMEOUT ]; then
				break
			fi
			sleep 1
		done

		if [ -d /sys/class/net/$port ]; then
			ovs_br_ports="$ovs_br_ports $port"
		else
			info "port device $port for bridge $br_name is missing."
			# A missing pf*sf* port is tolerated; any other missing port
			# makes the bridge be skipped
			case $port in
				pf*sf*)
					info "RDMA functionality is not expected to work without $port in $br_name"
				;;
				*)
					missing_port=$((missing_port+1))
				;;
			esac
		fi
	done

	if [ $missing_port -gt 0 ]; then
		info "Skipping $br_name configuration."
		continue
	fi

	# Return codes of add-br / add-port are not checked; the messages below
	# are logged either way
	$vsctl add-br $br_name
	info "Created bridge: $br_name"
	for port in $ovs_br_ports
	do
		$vsctl add-port $br_name $port
		info "bridge $br_name: added port $port"
	done
	if ! [[ $(uname -r) =~ "4.19" ]]; then
		ip link set $br_name up
	fi

done

# Enable OVS hardware offload; it takes effect after an OVS restart
if [ "X${OVS_HW_OFFLOAD}" == "Xyes" ]; then
	if $vsctl --no-wait set Open_vSwitch . Other_config:hw-offload=true; then
		info "OVS HW offload is set"
		info "Going to restart $ovs_service to activate hw-offload"
		need_restart=true
	fi
fi

# The restart has to happen inside the background job: need_restart is
# changed in this subshell, so the parent shell never sees the new value.
if [ "$need_restart" == true ]; then
	ovs_restart
fi
} &

#Indication that the script has finished and other underlying services can start
if systemctl is-active --quiet mlnx_bf_configure_sync.service; then
	systemctl stop --no-block mlnx_bf_configure_sync
fi

sync

exit $RC
