From ca869878b4e2f3139912daa81e31d4bcb951899e Mon Sep 17 00:00:00 2001
From: Mark Michelson <mmichelson@digium.com>
Date: Thu, 7 Jan 2016 15:37:36 -0600
Subject: [PATCH] PJSIP: Prevent deadlock due to dialog/transaction lock
 inversion.

A deadlock was observed where the monitor thread was stuck, therefore
resulting in no incoming SIP traffic being processed.

The problem occurred when two 200 OK responses arrived in response to a
terminating NOTIFY request sent from Asterisk. The first 200 OK was
dispatched to a threadpool worker, who locked the corresponding
transaction. The second 200 OK arrived, resulting in the monitor thread
locking the dialog. At this point, the two threads are at odds, because
the monitor thread attempts to lock the transaction, and the threadpool
thread loops attempting to try to lock the dialog.

In this case, the fix is to not have the monitor thread attempt to hold
both the dialog and transaction locks at the same time. Instead, we
release the dialog lock before attempting to lock the transaction.

There have also been some debug messages added to the process in an
attempt to make it more clear what is going on in the process.

ASTERISK-25668 #close
Reported by Mark Michelson

Change-Id: I4db0705f1403737b4360e33a8e6276805d086d4a
---
 res/res_pjsip/pjsip_distributor.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/res/res_pjsip/pjsip_distributor.c b/res/res_pjsip/pjsip_distributor.c
index ea7419b6ee..79c1916510 100644
--- a/res/res_pjsip/pjsip_distributor.c
+++ b/res/res_pjsip/pjsip_distributor.c
@@ -104,6 +104,10 @@ static struct ast_taskprocessor *find_request_serializer(pjsip_rx_data *rdata)
 		serializer_name = tsx->last_tx->mod_data[distributor_mod.id];
 		if (!ast_strlen_zero(serializer_name)) {
 			serializer = ast_taskprocessor_get(serializer_name, TPS_REF_IF_EXISTS);
+			if (serializer) {
+				ast_debug(3, "Found serializer %s on transaction %s\n",
+						serializer_name, tsx->obj_name);
+			}
 		}
 	}
 
@@ -252,27 +256,34 @@ static pj_bool_t distributor(pjsip_rx_data *rdata)
 	pjsip_dialog *dlg = find_dialog(rdata);
 	struct distributor_dialog_data *dist = NULL;
 	struct ast_taskprocessor *serializer = NULL;
-	struct ast_taskprocessor *req_serializer = NULL;
 	pjsip_rx_data *clone;
 
 	if (dlg) {
+		ast_debug(3, "Searching for serializer on dialog %s for %s\n",
+				dlg->obj_name, rdata->msg_info.info);
 		dist = pjsip_dlg_get_mod_data(dlg, distributor_mod.id);
 		if (dist) {
-			serializer = dist->serializer;
+			serializer = ao2_bump(dist->serializer);
+			if (serializer) {
+				ast_debug(3, "Found serializer %s on dialog %s\n",
+						ast_taskprocessor_name(serializer), dlg->obj_name);
+			}
 		}
+		pjsip_dlg_dec_lock(dlg);
 	}
 
 	if (serializer) {
 		/* We have a serializer so we know where to send the message. */
 	} else if (rdata->msg_info.msg->type == PJSIP_RESPONSE_MSG) {
-		req_serializer = find_request_serializer(rdata);
-		serializer = req_serializer;
+		ast_debug(3, "No dialog serializer for response %s. Using request transaction as basis\n",
+				rdata->msg_info.info);
+		serializer = find_request_serializer(rdata);
 	} else if (!pjsip_method_cmp(&rdata->msg_info.msg->line.req.method, &pjsip_cancel_method)
 		|| !pjsip_method_cmp(&rdata->msg_info.msg->line.req.method, &pjsip_bye_method)) {
 		/* We have a BYE or CANCEL request without a serializer. */
 		pjsip_endpt_respond_stateless(ast_sip_get_pjsip_endpoint(), rdata,
 			PJSIP_SC_CALL_TSX_DOES_NOT_EXIST, NULL, NULL, NULL);
-		goto end;
+		return PJ_TRUE;
 	}
 
 	pjsip_rx_data_clone(rdata, 0, &clone);
@@ -295,11 +306,7 @@ static pj_bool_t distributor(pjsip_rx_data *rdata)
 		ast_sip_push_task(serializer, distribute, clone);
 	}
 
-end:
-	if (dlg) {
-		pjsip_dlg_dec_lock(dlg);
-	}
-	ast_taskprocessor_unreference(req_serializer);
+	ast_taskprocessor_unreference(serializer);
 
 	return PJ_TRUE;
 }
-- 
GitLab