[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
58 messages Options
123
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
GitHub user clebertsuconic opened a pull request:

    https://github.com/apache/activemq-artemis/pull/1443

    ARTEMIS-1324 Deadlock detection and health check of critical components

   

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/clebertsuconic/activemq-artemis critical-prototype

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/activemq-artemis/pull/1443.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #1443
   
----
commit 58474c42b74d31e4727d4da58553c01533d4f4e6
Author: Clebert Suconic <[hidden email]>
Date:   2017-08-05T04:52:42Z

    ARTEMIS-1324 Deadlock detection and health check of critical components

----


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530496
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    +
    +   private volatile long checkTime;
    +
    +   @Override
    +   public void clear() {
    +      actions.clear();
    +      components.clear();
    +   }
    +
    +   private CopyOnWriteArrayList<CriticalAction> actions = new CopyOnWriteArrayList<>();
    +
    +   private Thread thread;
    +
    +   private final Semaphore running = new Semaphore(1);
    +
    +   private final ConcurrentHashSet<CriticalComponent> components = new ConcurrentHashSet<>();
    +
    +   @Override
    +   public void add(CriticalComponent component) {
    +      components.add(component);
    +   }
    +
    +   @Override
    +   public void remove(CriticalComponent component) {
    +      components.remove(component);
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setCheckTime(long timeout) {
    +      this.checkTime = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getCheckTime() {
    +      if (checkTime == 0) {
    +         checkTime = getTimeout() / 2;
    +      }
    +      return checkTime;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setTimeout(long timeout) {
    +      this.timeout = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getTimeout() {
    +      if (timeout == 0) {
    +         timeout = TimeUnit.MINUTES.toMillis(2);
    +      }
    +      return timeout;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer addAction(CriticalAction action) {
    +      this.actions.add(action);
    +      return this;
    +   }
    +
    +   @Override
    +   public void check() {
    +      boolean retry = true;
    +      while (retry) {
    +         try {
    +            for (CriticalComponent component : components) {
    +
    +               int pathReturned = component.isExpired(timeout);
    +               if (pathReturned >= 0) {
    +                  fireAction(component, pathReturned);
    +                  // no need to keep running if there's already a component failed
    +                  return;
    +               }
    +            }
    +            retry = false; // got to the end of the list, no need to retry
    +         } catch (ConcurrentModificationException dontCare) {
    +            // lets retry on the loop
    +         }
    +      }
    +   }
    +
    +   private void fireAction(CriticalComponent component, int path) {
    +      for (CriticalAction action: actions) {
    +         try {
    +            action.run(component, path);
    +         } catch (Throwable e) {
    +            logger.warn(e.getMessage(), e);
    +         }
    +      }
    +   }
    +
    +   @Override
    +   public void start() {
    +
    +      if (!running.tryAcquire()) {
    +         // already running
    +         return;
    +      }
    +
    +      // we are not using any Thread Pool or any Scheduled Executors from the ArtemisServer
    +      // as that would defeat the purpose,
    +      // as in any deadlocks the schedulers may be starving for something not responding fast enough
    +      thread = new Thread("Artemis Critical Analyzer") {
    +         @Override
    +         public void run() {
    +            try {
    +               while (true) {
    +                  if (running.tryAcquire(getCheckTime(), TimeUnit.MILLISECONDS)) {
    +                     running.release();
    +                     // this means that the server has been stopped as we could acquire the semaphore... returning now
    +                     break;
    +                  }
    +                  check();
    +               }
    +            } catch (InterruptedException interrupted) {
    +               // i will just leave on that case
    +            }
    +         }
    +      };
    +
    +      thread.setDaemon(true);
    +
    +      thread.start();
    +   }
    +
    +   @Override
    +   public void stop() {
    --- End diff --
   
    having start and stop, is a little risky for someone to miss remember to put the stop in a try, finally block in code which if forgot the stop could be quite fatal.
   
    An option (but may not be the right solution to the above problem) - is should it accept a lamda function / runnable / callable being the section to monitor. as such the try final can be in the core analyser and less likely or people to forget/miss later.
   
    The other option is that theres some static code analysis check to ensure if started there is a try finally stop within a method


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530423
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java ---
    @@ -608,6 +608,14 @@ public void parseMainConfig(final Element e, final Configuration config) throws
     
           config.setNetworkCheckPingCommand(getString(e, "network-check-ping-command", config.getNetworkCheckPingCommand(), Validators.NO_CHECK));
     
    +      config.setAnalyzeCritical(getBoolean(e, "analyze-critical", config.isAnalyzeCritical()));
    +
    +      config.setAnalyzeCriticalTimeout(getLong(e, "analyze-critical-timeout", config.getAnalyzeCriticalTimeout(), Validators.GE_ZERO));
    +
    +      config.setAnalyzeCriticalCheckPeriod(getLong(e, "analyze-critical-check-period", config.getAnalyzeCriticalCheckPeriod(), Validators.GE_ZERO));
    --- End diff --
   
    as per other review comment should the default not be half the timeout  if it is not set, not some arbitarty number which then forces the section of code later.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530398
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/config/impl/ConfigurationImpl.java ---
    @@ -2064,6 +2072,53 @@ public Configuration setNetworkCheckPing6Command(String command) {
           return this;
        }
     
    +   @Override
    +   public boolean isAnalyzeCritical() {
    +      return analyzeCritical;
    +   }
    +
    +   @Override
    +   public Configuration setAnalyzeCritical(boolean analyzeCritical) {
    +      this.analyzeCritical = analyzeCritical;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getAnalyzeCriticalTimeout() {
    +      return analyzeCriticalTimeout;
    +   }
    +
    +   @Override
    +   public Configuration setAnalyzeCriticalTimeout(long timeout) {
    +      this.analyzeCriticalTimeout = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getAnalyzeCriticalCheckPeriod() {
    +      if (analyzeCriticalCheckPeriod == ActiveMQDefaultConfiguration.getAnalyzeCriticalCheckPeriod()) {
    --- End diff --
   
    how would you differentiate between it being the default and someone actually have set it, should this defaulting not occur in config/broker.xml reading where if null then analyzeCriticalCheckPeriod is defaulted to half timeout?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis issue #1443: ARTEMIS-1324 Deadlock detection and health che...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on the issue:

    https://github.com/apache/activemq-artemis/pull/1443
 
    What is the over head of this, latency and throughput?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530849
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java ---
    @@ -477,12 +484,44 @@ public final synchronized void start() throws Exception {
           }
        }
     
    +   @Override
    +   public CriticalAnalyzer getCriticalAnalyzer() {
    +      return this.analyzer;
    +   }
    +
        private void internalStart() throws Exception {
           if (state != SERVER_STATE.STOPPED) {
              logger.debug("Server already started!");
              return;
           }
     
    +      if (!configuration.isAnalyzeCritical()) {
    --- End diff --
   
    why not do this within object construction?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530890
 
    --- Diff: artemis-server/src/test/resources/ConfigurationTest-full-config.xml ---
    @@ -57,6 +57,10 @@
           <global-max-size>1234567</global-max-size>
           <max-disk-usage>37</max-disk-usage>
           <disk-scan-period>123</disk-scan-period>
    +      <analyze-critical-halt>true</analyze-critical-halt>
    --- End diff --
   
    between documentation you name this Critical Analysis, then in config etc you switch round the words to analyse critical. suggest keeping naming constant.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530870
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java ---
    @@ -477,12 +484,44 @@ public final synchronized void start() throws Exception {
           }
        }
     
    +   @Override
    +   public CriticalAnalyzer getCriticalAnalyzer() {
    +      return this.analyzer;
    +   }
    +
        private void internalStart() throws Exception {
           if (state != SERVER_STATE.STOPPED) {
              logger.debug("Server already started!");
              return;
           }
     
    +      if (!configuration.isAnalyzeCritical()) {
    +         this.analyzer = new EmptyCriticalAnalyzer();
    +      }
    +
    +      /** Calling this for cases where the server was stopped and now is being restarted... failback, etc...*/
    +      this.analyzer.clear();
    +
    +      this.getCriticalAnalyzer().setCheckTime(configuration.getAnalyzeCriticalTimeout()).setTimeout(configuration.getAnalyzeCriticalTimeout());
    --- End diff --
   
    again why not do this in the constructor?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530746
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    --- End diff --
   
    if this is on hot path, why not use an atomic or have use atomicfieldupdater (same with check time)
    http://normanmaurer.me/blog/2013/10/28/Lesser-known-concurrent-classes-Part-1/



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131530954
 
    --- Diff: docs/user-manual/en/critical-analysis.md ---
    @@ -0,0 +1,32 @@
    +# Critical Analysis of the broker
    +
    +There are a few things that can go wrong on a production environment:
    +
    +- Bugs, for more than we try they still happen! We always try to correct them, but that's the only constant in software development.
    +- IO Errors, disks and hardware can go bad
    +- Memory issues, the CPU can go crazy by another process
    +
    +For cases like this, we added a protection to the broker to shut itself down when bad things happen.
    +
    +We measure time response in places like:
    +
    +- Queue delivery (add to the queue)
    +- Journal storage
    +- Paging operations
    +
    +If the response time goes beyond a configured timeout, the broker is considered unstable and an action will be taken to either shutdown the broker or halt the VM.
    +
    +You can use these following configuration options on broker.xml to configure how the critical analysis is performed.
    +
    +
    +Name | Description
    +:--- | :---
    +analyze-critical | Enable or disable the critical analysis (default true)
    +analyze-critical-timeout | Timeout used to do the critical analysis (default 120000 milliseconds)
    +analyze-critical-check-period | Time used to check the response times (default half of analyze-critical-timeout)
    +analyze-critical-halt | Should the VM be halted upon failures (default false)
    --- End diff --
   
    what occurs if you have this to no halt? does it log out, also is it available as a jmx endpoint so users monitoring systems using the exposed jmx endpoints, could be configured to check and alert also to their operational support teams.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532564
 
    --- Diff: docs/user-manual/en/critical-analysis.md ---
    @@ -0,0 +1,32 @@
    +# Critical Analysis of the broker
    +
    +There are a few things that can go wrong on a production environment:
    +
    +- Bugs, for more than we try they still happen! We always try to correct them, but that's the only constant in software development.
    +- IO Errors, disks and hardware can go bad
    +- Memory issues, the CPU can go crazy by another process
    +
    +For cases like this, we added a protection to the broker to shut itself down when bad things happen.
    +
    +We measure time response in places like:
    +
    +- Queue delivery (add to the queue)
    +- Journal storage
    +- Paging operations
    +
    +If the response time goes beyond a configured timeout, the broker is considered unstable and an action will be taken to either shutdown the broker or halt the VM.
    +
    +You can use these following configuration options on broker.xml to configure how the critical analysis is performed.
    +
    +
    +Name | Description
    +:--- | :---
    +analyze-critical | Enable or disable the critical analysis (default true)
    +analyze-critical-timeout | Timeout used to do the critical analysis (default 120000 milliseconds)
    +analyze-critical-check-period | Time used to check the response times (default half of analyze-critical-timeout)
    +analyze-critical-halt | Should the VM be halted upon failures (default false)
    --- End diff --
   
    Shutdown. Server.stop.  But there is no guarantee it would work on deadlocks.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532611
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/config/impl/ConfigurationImpl.java ---
    @@ -2064,6 +2072,53 @@ public Configuration setNetworkCheckPing6Command(String command) {
           return this;
        }
     
    +   @Override
    +   public boolean isAnalyzeCritical() {
    +      return analyzeCritical;
    +   }
    +
    +   @Override
    +   public Configuration setAnalyzeCritical(boolean analyzeCritical) {
    +      this.analyzeCritical = analyzeCritical;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getAnalyzeCriticalTimeout() {
    +      return analyzeCriticalTimeout;
    +   }
    +
    +   @Override
    +   public Configuration setAnalyzeCriticalTimeout(long timeout) {
    +      this.analyzeCriticalTimeout = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getAnalyzeCriticalCheckPeriod() {
    +      if (analyzeCriticalCheckPeriod == ActiveMQDefaultConfiguration.getAnalyzeCriticalCheckPeriod()) {
    --- End diff --
   
    if 0, then the we would calculate the check-period.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532619
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/deployers/impl/FileConfigurationParser.java ---
    @@ -608,6 +608,14 @@ public void parseMainConfig(final Element e, final Configuration config) throws
     
           config.setNetworkCheckPingCommand(getString(e, "network-check-ping-command", config.getNetworkCheckPingCommand(), Validators.NO_CHECK));
     
    +      config.setAnalyzeCritical(getBoolean(e, "analyze-critical", config.isAnalyzeCritical()));
    +
    +      config.setAnalyzeCriticalTimeout(getLong(e, "analyze-critical-timeout", config.getAnalyzeCriticalTimeout(), Validators.GE_ZERO));
    +
    +      config.setAnalyzeCriticalCheckPeriod(getLong(e, "analyze-critical-check-period", config.getAnalyzeCriticalCheckPeriod(), Validators.GE_ZERO));
    --- End diff --
   
    I didn't want to use null here. On this case, check period=0 would mean not set.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532629
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    +
    +   private volatile long checkTime;
    +
    +   @Override
    +   public void clear() {
    +      actions.clear();
    +      components.clear();
    +   }
    +
    +   private CopyOnWriteArrayList<CriticalAction> actions = new CopyOnWriteArrayList<>();
    +
    +   private Thread thread;
    +
    +   private final Semaphore running = new Semaphore(1);
    +
    +   private final ConcurrentHashSet<CriticalComponent> components = new ConcurrentHashSet<>();
    +
    +   @Override
    +   public void add(CriticalComponent component) {
    +      components.add(component);
    +   }
    +
    +   @Override
    +   public void remove(CriticalComponent component) {
    +      components.remove(component);
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setCheckTime(long timeout) {
    +      this.checkTime = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getCheckTime() {
    +      if (checkTime == 0) {
    +         checkTime = getTimeout() / 2;
    +      }
    +      return checkTime;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setTimeout(long timeout) {
    +      this.timeout = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getTimeout() {
    +      if (timeout == 0) {
    +         timeout = TimeUnit.MINUTES.toMillis(2);
    +      }
    +      return timeout;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer addAction(CriticalAction action) {
    +      this.actions.add(action);
    +      return this;
    +   }
    +
    +   @Override
    +   public void check() {
    +      boolean retry = true;
    +      while (retry) {
    +         try {
    +            for (CriticalComponent component : components) {
    +
    +               int pathReturned = component.isExpired(timeout);
    +               if (pathReturned >= 0) {
    +                  fireAction(component, pathReturned);
    +                  // no need to keep running if there's already a component failed
    +                  return;
    +               }
    +            }
    +            retry = false; // got to the end of the list, no need to retry
    +         } catch (ConcurrentModificationException dontCare) {
    +            // lets retry on the loop
    +         }
    +      }
    +   }
    +
    +   private void fireAction(CriticalComponent component, int path) {
    +      for (CriticalAction action: actions) {
    +         try {
    +            action.run(component, path);
    +         } catch (Throwable e) {
    +            logger.warn(e.getMessage(), e);
    +         }
    +      }
    +   }
    +
    +   @Override
    +   public void start() {
    +
    +      if (!running.tryAcquire()) {
    +         // already running
    +         return;
    +      }
    +
    +      // we are not using any Thread Pool or any Scheduled Executors from the ArtemisServer
    +      // as that would defeat the purpose,
    +      // as in any deadlocks the schedulers may be starving for something not responding fast enough
    +      thread = new Thread("Artemis Critical Analyzer") {
    +         @Override
    +         public void run() {
    +            try {
    +               while (true) {
    +                  if (running.tryAcquire(getCheckTime(), TimeUnit.MILLISECONDS)) {
    +                     running.release();
    +                     // this means that the server has been stopped as we could acquire the semaphore... returning now
    +                     break;
    +                  }
    +                  check();
    +               }
    +            } catch (InterruptedException interrupted) {
    +               // i will just leave on that case
    +            }
    +         }
    +      };
    +
    +      thread.setDaemon(true);
    +
    +      thread.start();
    +   }
    +
    +   @Override
    +   public void stop() {
    --- End diff --
   
    this stop here is like we have in many other components.
   
   
    start means, start the thread.
    stop means shutdown the thread.
   
    I'm not sure what you mean?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532658
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    --- End diff --
   
    I don't need it to be atomic really. as soon as the dead lock stops updating this.. it would be enough to me to just have a volatile here.
   
   
    The only overhead is the call to System.currentTimeMillis(); I did some tests and didn't see any regressions.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532685
 
    --- Diff: artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/ActiveMQServerImpl.java ---
    @@ -477,12 +484,44 @@ public final synchronized void start() throws Exception {
           }
        }
     
    +   @Override
    +   public CriticalAnalyzer getCriticalAnalyzer() {
    +      return this.analyzer;
    +   }
    +
        private void internalStart() throws Exception {
           if (state != SERVER_STATE.STOPPED) {
              logger.debug("Server already started!");
              return;
           }
     
    +      if (!configuration.isAnalyzeCritical()) {
    --- End diff --
   
    I thought the configuration was being parsed inside ActiveMQServerImpl. (Bad memory, I forget about stuff months after working on stuff :) )... I will move it to constructor.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user clebertsuconic commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131532700
 
    --- Diff: docs/user-manual/en/critical-analysis.md ---
    @@ -0,0 +1,32 @@
    +# Critical Analysis of the broker
    +
    +There are a few things that can go wrong on a production environment:
    +
    +- Bugs, for more than we try they still happen! We always try to correct them, but that's the only constant in software development.
    +- IO Errors, disks and hardware can go bad
    +- Memory issues, the CPU can go crazy by another process
    +
    +For cases like this, we added a protection to the broker to shut itself down when bad things happen.
    +
    +We measure time response in places like:
    +
    +- Queue delivery (add to the queue)
    +- Journal storage
    +- Paging operations
    +
    +If the response time goes beyond a configured timeout, the broker is considered unstable and an action will be taken to either shutdown the broker or halt the VM.
    +
    +You can use these following configuration options on broker.xml to configure how the critical analysis is performed.
    +
    +
    +Name | Description
    +:--- | :---
    +analyze-critical | Enable or disable the critical analysis (default true)
    +analyze-critical-timeout | Timeout used to do the critical analysis (default 120000 milliseconds)
    +analyze-critical-check-period | Time used to check the response times (default half of analyze-critical-timeout)
    +analyze-critical-halt | Should the VM be halted upon failures (default false)
    --- End diff --
   
    The idea here was to add monitoring on internal objects similar to what we do on pings / pongs through the protocols. I didn't want to make it any more complex than needed.
   
    This subject could be raised to become an independent project if you keep investing here.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131534288
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    +
    +   private volatile long checkTime;
    +
    +   @Override
    +   public void clear() {
    +      actions.clear();
    +      components.clear();
    +   }
    +
    +   private CopyOnWriteArrayList<CriticalAction> actions = new CopyOnWriteArrayList<>();
    +
    +   private Thread thread;
    +
    +   private final Semaphore running = new Semaphore(1);
    +
    +   private final ConcurrentHashSet<CriticalComponent> components = new ConcurrentHashSet<>();
    +
    +   @Override
    +   public void add(CriticalComponent component) {
    +      components.add(component);
    +   }
    +
    +   @Override
    +   public void remove(CriticalComponent component) {
    +      components.remove(component);
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setCheckTime(long timeout) {
    +      this.checkTime = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getCheckTime() {
    +      if (checkTime == 0) {
    +         checkTime = getTimeout() / 2;
    +      }
    +      return checkTime;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer setTimeout(long timeout) {
    +      this.timeout = timeout;
    +      return this;
    +   }
    +
    +   @Override
    +   public long getTimeout() {
    +      if (timeout == 0) {
    +         timeout = TimeUnit.MINUTES.toMillis(2);
    +      }
    +      return timeout;
    +   }
    +
    +   @Override
    +   public CriticalAnalyzer addAction(CriticalAction action) {
    +      this.actions.add(action);
    +      return this;
    +   }
    +
    +   @Override
    +   public void check() {
    +      boolean retry = true;
    +      while (retry) {
    +         try {
    +            for (CriticalComponent component : components) {
    +
    +               int pathReturned = component.isExpired(timeout);
    +               if (pathReturned >= 0) {
    +                  fireAction(component, pathReturned);
    +                  // no need to keep running if there's already a component failed
    +                  return;
    +               }
    +            }
    +            retry = false; // got to the end of the list, no need to retry
    +         } catch (ConcurrentModificationException dontCare) {
    +            // lets retry on the loop
    +         }
    +      }
    +   }
    +
    +   private void fireAction(CriticalComponent component, int path) {
    +      for (CriticalAction action: actions) {
    +         try {
    +            action.run(component, path);
    +         } catch (Throwable e) {
    +            logger.warn(e.getMessage(), e);
    +         }
    +      }
    +   }
    +
    +   @Override
    +   public void start() {
    +
    +      if (!running.tryAcquire()) {
    +         // already running
    +         return;
    +      }
    +
    +      // we are not using any Thread Pool or any Scheduled Executors from the ArtemisServer
    +      // as that would defeat the purpose,
    +      // as in any deadlocks the schedulers may be starving for something not responding fast enough
    +      thread = new Thread("Artemis Critical Analyzer") {
    +         @Override
    +         public void run() {
    +            try {
    +               while (true) {
    +                  if (running.tryAcquire(getCheckTime(), TimeUnit.MILLISECONDS)) {
    +                     running.release();
    +                     // this means that the server has been stopped as we could acquire the semaphore... returning now
    +                     break;
    +                  }
    +                  check();
    +               }
    +            } catch (InterruptedException interrupted) {
    +               // i will just leave on that case
    +            }
    +         }
    +      };
    +
    +      thread.setDaemon(true);
    +
    +      thread.start();
    +   }
    +
    +   @Override
    +   public void stop() {
    --- End diff --
   
    From what I understand start , begins some timer, and stop ends it. If the timer > x action y will occur.
   
    As such if someone in code accidentally forgets to call stop or forgets to put it in a finally block so that when exceptions occur it also stops the timer.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131534303
 
    --- Diff: docs/user-manual/en/critical-analysis.md ---
    @@ -0,0 +1,32 @@
    +# Critical Analysis of the broker
    +
    +There are a few things that can go wrong on a production environment:
    +
    +- Bugs, for more than we try they still happen! We always try to correct them, but that's the only constant in software development.
    +- IO Errors, disks and hardware can go bad
    +- Memory issues, the CPU can go crazy by another process
    +
    +For cases like this, we added a protection to the broker to shut itself down when bad things happen.
    +
    +We measure time response in places like:
    +
    +- Queue delivery (add to the queue)
    +- Journal storage
    +- Paging operations
    +
    +If the response time goes beyond a configured timeout, the broker is considered unstable and an action will be taken to either shutdown the broker or halt the VM.
    +
    +You can use these following configuration options on broker.xml to configure how the critical analysis is performed.
    +
    +
    +Name | Description
    +:--- | :---
    +analyze-critical | Enable or disable the critical analysis (default true)
    +analyze-critical-timeout | Timeout used to do the critical analysis (default 120000 milliseconds)
    +analyze-critical-check-period | Time used to check the response times (default half of analyze-critical-timeout)
    +analyze-critical-halt | Should the VM be halted upon failures (default false)
    --- End diff --
   
    I was more saying:
   
    how does an outside observer view the state. Eg many people rely on logs or JMX


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[GitHub] activemq-artemis pull request #1443: ARTEMIS-1324 Deadlock detection and hea...

franz1981
In reply to this post by franz1981
Github user michaelandrepearce commented on a diff in the pull request:

    https://github.com/apache/activemq-artemis/pull/1443#discussion_r131534309
 
    --- Diff: artemis-commons/src/main/java/org/apache/activemq/artemis/utils/critical/CriticalAnalyzerImpl.java ---
    @@ -0,0 +1,182 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements. See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License. You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.activemq.artemis.utils.critical;
    +
    +import java.util.ConcurrentModificationException;
    +import java.util.concurrent.CopyOnWriteArrayList;
    +import java.util.concurrent.Semaphore;
    +import java.util.concurrent.TimeUnit;
    +
    +import org.apache.activemq.artemis.utils.collections.ConcurrentHashSet;
    +import org.jboss.logging.Logger;
    +
    +public class CriticalAnalyzerImpl implements CriticalAnalyzer {
    +
    +   private final Logger logger = Logger.getLogger(CriticalAnalyzer.class);
    +
    +   private volatile long timeout;
    --- End diff --
   
    Is this one of these values not being updated by one thread and read by another then? I may have miss understood the code.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [hidden email] or file a JIRA ticket
with INFRA.
---
123
Loading...