From 855ed6c1685a265d74208ad284e5a6945afb9168 Mon Sep 17 00:00:00 2001
From: Joshua Elson <joshelson@gmail.com>
Date: Mon, 18 Mar 2024 15:14:36 -0400
Subject: [PATCH] Implement Configurable TCP Keepalive Settings in PJSIP
 Transports

This commit introduces configurable TCP keepalive settings for both TCP and TLS transports. The changes allow for finer control over TCP connection keepalives, enhancing stability and reliability in environments prone to connection timeouts or where intermediate devices may prematurely close idle connections. The feature has already been tested in production in several specialized environments where the underlying transport is unreliable in ways that are not directly visible to the operating system, which is why these keepalive and timeout mechanisms have proven necessary.

Fixes #657

(cherry picked from commit 3d40d3427106fc751d77d2d5f182a92becfdd53a)
---
 configs/samples/pjsip.conf.sample             | 26 +++++
 ...6f03e_add_tcp_keepalive_settings_to_ps_.py | 28 ++++++
 include/asterisk/res_pjsip.h                  |  8 ++
 res/res_pjsip/config_transport.c              | 99 +++++++++++++++++--
 res/res_pjsip/pjsip_config.xml                | 24 +++++
 third-party/pjproject/patches/config_site.h   |  9 ++
 6 files changed, 184 insertions(+), 10 deletions(-)
 create mode 100644 contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py

diff --git a/configs/samples/pjsip.conf.sample b/configs/samples/pjsip.conf.sample
index 247b540276..9d454089d3 100644
--- a/configs/samples/pjsip.conf.sample
+++ b/configs/samples/pjsip.conf.sample
@@ -171,6 +171,32 @@
 ;type=transport
 ;protocol=flow
 
+; Example IPv4 TCP transport with Keepalive options
+;
+;[transport-tcp]
+;type=transport
+;protocol=tcp
+;bind=0.0.0.0
+;tcp_keepalive_enable=yes        ; Enable TCP keepalive (yes/no)
+;tcp_keepalive_idle_time=30      ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes
+;tcp_keepalive_interval_time=10  ; The time in seconds between individual keepalive probes
+;tcp_keepalive_probe_count=5     ; The maximum number of keepalive probes TCP should send before dropping the connection
+
+; Example IPv4 TLS transport with Keepalive options
+;
+;[transport-tls]
+;type=transport
+;protocol=tls
+;bind=0.0.0.0
+;cert_file=/path/to/mycert.crt
+;priv_key_file=/path/to/mykey.key
+;cipher=ADH-AES256-SHA,ADH-AES128-SHA
+;method=tlsv1
+;tcp_keepalive_enable=yes        ; Enable TCP keepalive (yes/no)
+;tcp_keepalive_idle_time=30      ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes
+;tcp_keepalive_interval_time=10  ; The time in seconds between individual keepalive probes
+;tcp_keepalive_probe_count=5     ; The maximum number of keepalive probes TCP should send before dropping the connection
+
 ;===============OUTBOUND REGISTRATION WITH OUTBOUND AUTHENTICATION============
 ;
 ; This is a simple registration that works with some SIP trunking providers.
diff --git a/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py b/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py
new file mode 100644
index 0000000000..4844a60ed7
--- /dev/null
+++ b/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py
@@ -0,0 +1,28 @@
+"""Add TCP keepalive settings to ps_transports
+
+Revision ID: 8fce8496f03e
+Revises: 74dc751dfe8e
+Create Date: 2024-03-18 17:00:17.148018
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '8fce8496f03e'
+down_revision = '74dc751dfe8e'
+
+from alembic import op
+import sqlalchemy as sa
+
+def upgrade():
+    with op.batch_alter_table('ps_transports') as batch_op:
+        batch_op.add_column(sa.Column('tcp_keepalive_enable', sa.Boolean(), nullable=True))
+        batch_op.add_column(sa.Column('tcp_keepalive_idle_time', sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column('tcp_keepalive_interval_time', sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column('tcp_keepalive_probe_count', sa.Integer(), nullable=True))
+
+def downgrade():
+    with op.batch_alter_table('ps_transports') as batch_op:
+        batch_op.drop_column('tcp_keepalive_enable')
+        batch_op.drop_column('tcp_keepalive_idle_time')
+        batch_op.drop_column('tcp_keepalive_interval_time')
+        batch_op.drop_column('tcp_keepalive_probe_count')
diff --git a/include/asterisk/res_pjsip.h b/include/asterisk/res_pjsip.h
index b320cff525..214605209a 100644
--- a/include/asterisk/res_pjsip.h
+++ b/include/asterisk/res_pjsip.h
@@ -299,6 +299,14 @@ struct ast_sip_transport {
 	int symmetric_transport;
 	/*! This is a flow to another target */
 	int flow;
+	/*! Enable TCP keepalive */
+	int tcp_keepalive_enable;
+	/*! Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes */
+	int tcp_keepalive_idle_time;
+	/*! The time in seconds between individual keepalive probes */
+	int tcp_keepalive_interval_time;
+	/*! The maximum number of keepalive probes TCP should send before dropping the connection */
+	int tcp_keepalive_probe_count;
 };
 
 #define SIP_SORCERY_DOMAIN_ALIAS_TYPE "domain_alias"
diff --git a/res/res_pjsip/config_transport.c b/res/res_pjsip/config_transport.c
index 07b60a9a70..f538eaf862 100644
--- a/res/res_pjsip/config_transport.c
+++ b/res/res_pjsip/config_transport.c
@@ -828,17 +828,55 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 	} else if (transport->type == AST_TRANSPORT_TCP) {
 		pjsip_tcp_transport_cfg cfg;
 		static int option = 1;
+		int sockopt_count = 0;
 
 		pjsip_tcp_transport_cfg_default(&cfg, temp_state->state->host.addr.sa_family);
 		cfg.bind_addr = temp_state->state->host;
 		cfg.async_cnt = transport->async_operations;
 		set_qos(transport, &cfg.qos_params);
+
 		/* sockopt_params.options is copied to each newly connected socket */
-		cfg.sockopt_params.options[0].level = pj_SOL_TCP();
-		cfg.sockopt_params.options[0].optname = pj_TCP_NODELAY();
-		cfg.sockopt_params.options[0].optval = &option;
-		cfg.sockopt_params.options[0].optlen = sizeof(option);
-		cfg.sockopt_params.cnt = 1;
+		cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+		cfg.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
+		cfg.sockopt_params.options[sockopt_count].optval = &option;
+		cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+		sockopt_count++;
+
+		if (transport->tcp_keepalive_enable) {
+#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
+			ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
+				ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
+			cfg.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
+			cfg.sockopt_params.options[sockopt_count].optval = &option;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
+			sockopt_count++;
+#else
+			ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
+				ast_sorcery_object_get_id(obj));
+#endif
+		}
+
+		cfg.sockopt_params.cnt = sockopt_count;
 
 		for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
 			if (perm_state && perm_state->state && perm_state->state->factory
@@ -853,6 +891,7 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 	} else if (transport->type == AST_TRANSPORT_TLS) {
 #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
 		static int option = 1;
+		int sockopt_count = 0;
 
 		if (transport->async_operations > 1 && ast_compare_versions(pj_get_version(), "2.5.0") < 0) {
 			ast_log(LOG_ERROR, "Transport: %s: When protocol=tls and pjproject version < 2.5.0, async_operations can't be > 1\n",
@@ -864,11 +903,47 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 		set_qos(transport, &temp_state->state->tls.qos_params);
 
 		/* sockopt_params.options is copied to each newly connected socket */
-		temp_state->state->tls.sockopt_params.options[0].level = pj_SOL_TCP();
-		temp_state->state->tls.sockopt_params.options[0].optname = pj_TCP_NODELAY();
-		temp_state->state->tls.sockopt_params.options[0].optval = &option;
-		temp_state->state->tls.sockopt_params.options[0].optlen = sizeof(option);
-		temp_state->state->tls.sockopt_params.cnt = 1;
+		temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+		sockopt_count++;
+
+		if (transport->tcp_keepalive_enable) {
+#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
+			ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
+				ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
+			sockopt_count++;
+#else
+			ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
+				ast_sorcery_object_get_id(obj));
+#endif
+		}
+
+		temp_state->state->tls.sockopt_params.cnt = sockopt_count;
 
 		for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
 			if (perm_state && perm_state->state && perm_state->state->factory
@@ -1760,6 +1835,10 @@ int ast_sip_initialize_sorcery_transport(void)
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "require_client_cert", "", transport_tls_bool_handler, require_client_cert_to_str, NULL, 0, 0);
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "allow_wildcard_certs", "", transport_tls_bool_handler, allow_wildcard_certs_to_str, NULL, 0, 0);
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "method", "", transport_tls_method_handler, tls_method_to_str, NULL, 0, 0);
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_enable", "no", OPT_BOOL_T, 1, FLDSET(struct ast_sip_transport, tcp_keepalive_enable));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_idle_time", "30", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_idle_time));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_interval_time", "10", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_interval_time));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_probe_count", "5", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_probe_count));
 #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "cipher", "", transport_tls_cipher_handler, transport_tls_cipher_to_str, NULL, 0, 0);
 #endif
diff --git a/res/res_pjsip/pjsip_config.xml b/res/res_pjsip/pjsip_config.xml
index 9eb61e9302..6a8421f18b 100644
--- a/res/res_pjsip/pjsip_config.xml
+++ b/res/res_pjsip/pjsip_config.xml
@@ -1800,6 +1800,30 @@
 				<configOption name="require_client_cert" default="false">
 					<synopsis>Require client certificate (TLS ONLY, not WSS)</synopsis>
 				</configOption>
+				<configOption name="tcp_keepalive_enable" default="no">
+					<synopsis>Enable TCP keepalive</synopsis>
+					<description><para>
+						When set to 'yes', TCP keepalive messages are sent to verify that the endpoint is still reachable. This can help detect dead TCP connections in environments where connections may be silently dropped (e.g., NAT timeouts).
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_idle_time" default="30">
+					<synopsis>Idle time before the first TCP keepalive probe is sent</synopsis>
+					<description><para>
+						Specifies the amount of time in seconds that the connection must be idle before the first TCP keepalive probe is sent. An idle connection is defined as a connection in which no data has been sent or received by the application.
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_interval_time" default="10">
+					<synopsis>Interval between TCP keepalive probes</synopsis>
+					<description><para>
+						Specifies the interval in seconds between individual TCP keepalive probes, once the first probe is sent. This interval is used for subsequent probes if the peer does not respond to the previous probe.
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_probe_count" default="5">
+					<synopsis>Maximum number of TCP keepalive probes</synopsis>
+					<description><para>
+						Specifies the maximum number of TCP keepalive probes to send before considering the connection dead and notifying the application. If the peer does not respond after this many probes, the connection is considered broken.
+					</para></description>
+				</configOption>
 				<configOption name="type">
 					<synopsis>Must be of type 'transport'.</synopsis>
 				</configOption>
diff --git a/third-party/pjproject/patches/config_site.h b/third-party/pjproject/patches/config_site.h
index bb40c7bcd4..0492b04812 100644
--- a/third-party/pjproject/patches/config_site.h
+++ b/third-party/pjproject/patches/config_site.h
@@ -35,6 +35,15 @@
 #define PJ_IOQUEUE_HAS_SAFE_UNREG 1
 #define PJ_IOQUEUE_MAX_EVENTS_IN_SINGLE_POLL (16)
 
+/*
+ * Increase the number of socket options available. This adjustment is necessary
+ * to accommodate additional TCP keepalive settings required for optimizing SIP
+ * transport stability, especially in environments prone to connection timeouts.
+ * The default limit is insufficient when configuring all desired keepalive
+ * parameters along with standard socket options.
+ */
+#define PJ_MAX_SOCKOPT_PARAMS 5
+
 #define PJ_SCANNER_USE_BITWISE	0
 #define PJ_OS_HAS_CHECK_STACK	0
 
-- 
GitLab