CrateDB 5.4: Write thread pool getting stuck after upgrade from 5.3.2

After upgrading from CrateDB 5.3.2 to 5.4.2 (using the CrateDB JDBC 2.6.0 jar), all write threads are blocked on a single monitor. The thread dump URL is below for reference.

Thread dump report: Java thread dump analysis report
Link to download: Download Threaddump

"cratedb[crate03][write][T#25]" #282 [3360260] daemon prio=5 os_prio=0 cpu=99514.17ms elapsed=149022.00s tid=0x00007f313c017fb0 nid=3360260 waiting on condition  [0x00007ea4087d7000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#26]" #283 [3360261] daemon prio=5 os_prio=0 cpu=99261.10ms elapsed=149022.00s tid=0x00007f3120007dd0 nid=3360261 waiting on condition  [0x00007ea4086d6000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#27]" #284 [3360262] daemon prio=5 os_prio=0 cpu=98810.41ms elapsed=149022.00s tid=0x00007f314400d960 nid=3360262 waiting on condition  [0x00007ea4085d5000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#28]" #285 [3360263] daemon prio=5 os_prio=0 cpu=99012.50ms elapsed=149022.00s tid=0x00007f315004efb0 nid=3360263 waiting on condition  [0x00007ea4084d4000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#29]" #286 [3360264] daemon prio=5 os_prio=0 cpu=99243.00ms elapsed=149021.99s tid=0x00007f3148021310 nid=3360264 waiting on condition  [0x00007ea4083d3000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#30]" #287 [3360265] daemon prio=5 os_prio=0 cpu=99556.14ms elapsed=149021.99s tid=0x00007f31500712f0 nid=3360265 waiting on condition  [0x00007ea4082d2000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#31]" #288 [3360266] daemon prio=5 os_prio=0 cpu=99505.87ms elapsed=149021.99s tid=0x00007f3150072c70 nid=3360266 waiting on condition  [0x00007ea4081d1000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
--
	at java.lang.Thread.run(java.base@20.0.2/Thread.java:1623)

"cratedb[crate03][write][T#32]" #289 [3360267] daemon prio=5 os_prio=0 cpu=99367.26ms elapsed=149021.99s tid=0x00007f31480222c0 nid=3360267 waiting on condition  [0x00007ea3dbffe000]
   java.lang.Thread.State: WAITING (parking)
	at jdk.internal.misc.Unsafe.park(java.base@20.0.2/Native Method)
	- parking to wait for  <0x00007f3304bdd638> (a java.util.concurrent.LinkedTransferQueue)
	at java.util.concurrent.locks.LockSupport.park(java.base@20.0.2/LockSupport.java:371)
	at java.util.concurrent.LinkedTransferQueue$Node.block(java.base@20.0.2/LinkedTransferQueue.java:470)
	at java.util.concurrent.ForkJoinPool.unmanagedBlock(java.base@20.0.2/ForkJoinPool.java:3780)
	at java.util.concurrent.ForkJoinPool.managedBlock(java.base@20.0.2/ForkJoinPool.java:3725)
	at java.util.concurrent.LinkedTransferQueue.awaitMatch(java.base@20.0.2/LinkedTransferQueue.java:669)
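
For anyone trying to narrow this down: the state of the pools can also be checked from inside CrateDB via the sys.nodes table. Below is a minimal JDBC sketch (the host name, port, user, and the crate:// URL scheme are assumptions; adjust them to your cluster and driver setup) that prints the per-node thread pool counters, so the write pool's queue and rejected values can be compared against what the dump above shows.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class WritePoolCheck {
    public static void main(String[] args) throws Exception {
        // Assumed connection details: host "crate03", CrateDB's default
        // PostgreSQL port 5432, user "crate", crate-jdbc URL scheme.
        String url = "crate://crate03:5432/";
        try (Connection conn = DriverManager.getConnection(url, "crate", "");
             Statement stmt = conn.createStatement();
             // sys.nodes has one row per node; thread_pools holds the
             // active/queue/rejected counters for each pool (write, search, get, ...).
             ResultSet rs = stmt.executeQuery("SELECT name, thread_pools FROM sys.nodes")) {
            while (rs.next()) {
                System.out.println(rs.getString("name") + " -> " + rs.getObject("thread_pools"));
            }
        }
    }
}

Run it with the crate-jdbc jar on the classpath; comparing the queue and rejected counters across the three nodes while the issue is happening should help pinpoint where writes are piling up.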

Hi @Satishlk,
Could you elaborate a bit on your setup and the issue you are observing at a higher level? Are you running INSERTs or UPDATEs? Are there any exceptions logged in the CrateDB log on the nodes?


Hi @hernanc,

We are running a 3-node cluster with 32 CPUs and 48 GB of RAM. We use CrateDB as a secondary database for searches over transaction data. A pipeline job syncs data from MySQL through Kafka into CrateDB, and we expose APIs that aggregate the data for the client application. Insertions generally peak at 100-1000 per second, and search volume is around 1.5k-2.0k requests per minute. We are not seeing any exceptions in the logs, but the search threads get clogged as soon as we route requests to the CrateDB instance.

We have an older CrateDB 3.0.6 instance that handles the same workload, albeit with higher response times. We created a new 5.3.2 instance and then migrated it to 5.4.2. When we checked the thread dump, we saw search, write, and get threads all waiting on the same resource. We use a Java Spring Boot application to connect to CrateDB with the CrateDB JDBC driver 2.6.0.
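
One way to see what those pools are busy with (a suggestion, not a confirmed diagnosis) is to list the currently running statements from sys.jobs while the threads are clogged. A minimal sketch, reusing the same assumed connection details as above:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class RunningJobs {
    public static void main(String[] args) throws Exception {
        // Same assumed connection details as in the earlier sketch.
        String url = "crate://crate03:5432/";
        try (Connection conn = DriverManager.getConnection(url, "crate", "");
             Statement stmt = conn.createStatement();
             // sys.jobs lists the statements currently executing on the cluster,
             // oldest first; entries that stay here for a long time point at the
             // statements occupying the write/search/get pools.
             ResultSet rs = stmt.executeQuery(
                     "SELECT started, username, stmt FROM sys.jobs ORDER BY started ASC LIMIT 20")) {
            while (rs.next()) {
                System.out.println(rs.getTimestamp("started") + " | "
                        + rs.getString("username") + " | " + rs.getString("stmt"));
            }
        }
    }
}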

Hi,
After running OPTIMIZE and ANALYZE and reducing the number of connections to the CrateDB cluster, the system ran fine for a couple of weeks, with response times down from 20 s to 300 ms and CPU usage down from 100% to 30%. After we upgraded the kernel version on the cluster's AWS EC2 machines, we started seeing similar behavior of requests getting blocked, together with the errors below on one of the cluster nodes. Can you please help with how to fix the "shard is now inactive" error?
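
For reference, here is a minimal sketch of how such maintenance statements can be issued over JDBC; the table name your_schema.txn is a placeholder, and the connection details are the same assumptions as in the earlier sketches:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class Maintenance {
    public static void main(String[] args) throws Exception {
        // Same assumed connection details as in the earlier sketches.
        String url = "crate://crate03:5432/";
        try (Connection conn = DriverManager.getConnection(url, "crate", "");
             Statement stmt = conn.createStatement()) {
            // Merge the segments of a (placeholder) table; on a partitioned
            // table this runs per partition.
            stmt.execute("OPTIMIZE TABLE your_schema.txn");
            // Refresh the table statistics used by the query planner.
            stmt.execute("ANALYZE");
        }
    }
}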

We are seeing the messages below in the logs of one of the cluster nodes:

Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.f.SyncedFlushService] [crate03] flushing shard [ezetap..partitioned.txn.042j4c9g64][7], node[mi9WL0vkS3K02Fcqew4SAw], [R], s[STARTED], a[id=M13XQUKCQLu0SB90hxdnfg] on inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.s.IndexShard       ] [crate03]  [ezetap..partitioned.txn.042j4c9g64][8]shard is now inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.f.SyncedFlushService] [crate03] flushing shard [ezetap..partitioned.txn.042j4c9g64][8], node[mi9WL0vkS3K02Fcqew4SAw], [R], s[STARTED], a[id=6TEib-zzSvSWQfej2mn5Ug] on inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.s.IndexShard       ] [crate03]  [ezetap..partitioned.txn.042j4c9g64][9]shard is now inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.f.SyncedFlushService] [crate03] flushing shard [ezetap..partitioned.txn.042j4c9g64][9], node[mi9WL0vkS3K02Fcqew4SAw], [R], s[STARTED], a[id=7bOBBI_2QjuGbAvv9FKKvg] on inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.f.SyncedFlushService] [crate03] flushing shard [ezetap..partitioned.txn.042j4c9g64][8], node[mi9WL0vkS3K02Fcqew4SAw], [R], s[STARTED], a[id=6TEib-zzSvSWQfej2mn5Ug] on inactive
Oct  4 10:31:37 ip-10-207-97-170 crate[1701792]: [2023-10-04T10:31:37,462][DEBUG][o.e.i.s.IndexShard       ] [crate03]  [ezetap..partitioned.txn.042j4c9g64][9]shard is now inactive
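
As a side note, these "shard is now inactive" lines are DEBUG-level messages written when an idle shard is flushed, so on their own they are not necessarily errors. To check whether any shard is actually in an unhealthy state, sys.shards can be queried; a minimal sketch with the same assumed connection details:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ShardHealth {
    public static void main(String[] args) throws Exception {
        // Same assumed connection details as in the earlier sketches.
        String url = "crate://crate03:5432/";
        try (Connection conn = DriverManager.getConnection(url, "crate", "");
             Statement stmt = conn.createStatement();
             // List shards that are not in the STARTED routing state
             // (e.g. INITIALIZING or UNASSIGNED).
             ResultSet rs = stmt.executeQuery(
                     "SELECT schema_name, table_name, id, state, routing_state "
                             + "FROM sys.shards WHERE routing_state <> 'STARTED'")) {
            while (rs.next()) {
                System.out.println(rs.getString("schema_name") + "." + rs.getString("table_name")
                        + "[" + rs.getInt("id") + "] " + rs.getString("state")
                        + " / " + rs.getString("routing_state"));
            }
        }
    }
}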

Threads blocked on the CrateDB database:

"cratedb[crate02][get][T#1]" #124 [1430238] daemon prio=5 os_prio=0 cpu=8.18ms elapsed=74999.96s tid=0x00007f83281c3010 nid=1430238 waiting on condition  [0x00007f83992f1000]
"cratedb[crate02][get][T#2]" #127 [1430411] daemon prio=5 os_prio=0 cpu=3.86ms elapsed=74988.70s tid=0x00007f80344ccfb0 nid=1430411 waiting on condition  [0x00007f8015b4a000]
"cratedb[crate02][get][T#3]" #132 [1430608] daemon prio=5 os_prio=0 cpu=16.11ms elapsed=74917.52s tid=0x00007f825c0db3c0 nid=1430608 waiting on condition  [0x00007f8039dea000]
"cratedb[crate02][get][T#4]" #135 [1430796] daemon prio=5 os_prio=0 cpu=10.33ms elapsed=74869.66s tid=0x00007f802413af30 nid=1430796 waiting on condition  [0x00007f800fb4e000]
"cratedb[crate02][get][T#5]" #137 [1430816] daemon prio=5 os_prio=0 cpu=6.98ms elapsed=74824.45s tid=0x00007f80004149a0 nid=1430816 waiting on condition  [0x00007f7f44e7a000]
"cratedb[crate02][get][T#6]" #179 [1431046] daemon prio=5 os_prio=0 cpu=8.47ms elapsed=74780.09s tid=0x00007f1a982dd5e0 nid=1431046 waiting on condition  [0x00007f0577289000]
"cratedb[crate02][get][T#7]" #185 [1431238] daemon prio=5 os_prio=0 cpu=10.03ms elapsed=74716.02s tid=0x00007f14bc0040b0 nid=1431238 waiting on condition  [0x00007ef19a706000]
"cratedb[crate02][get][T#8]" #191 [1431246] daemon prio=5 os_prio=0 cpu=3.08ms elapsed=74697.62s tid=0x00007f14c806bc40 nid=1431246 waiting on condition  [0x00007eeeb7bcf000]
"cratedb[crate02][get][T#9]" #17402 [1521787] daemon prio=5 os_prio=0 cpu=1.35ms elapsed=50234.44s tid=0x00007efc24005090 nid=1521787 waiting on condition  [0x00007f81cebf9000]
"cratedb[crate02][get][T#10]" #36675 [1669209] daemon prio=5 os_prio=0 cpu=8.21ms elapsed=7403.46s tid=0x00007eed840e3ef0 nid=1669209 waiting on condition  [0x00007f7d4c9ac000]

In case anyone comes across this thread in the future: the conversation has continued in the topic “Crate database showing ‘shard is now inactive’” on the CrateDB Community forum.